diff options
author | Jeff Huth <jeff.huth@bytecode.io> | 2019-11-13 17:03:56 -0800 |
---|---|---|
committer | Jeff Huth <jeff.huth@bytecode.io> | 2019-11-13 17:03:56 -0800 |
commit | 89643ba6fa98db82efd3246805ef801a8bfb5c81 (patch) | |
tree | 739027b4e827def2db81631c9d6ed58ec2b97809 /tap_google_sheets/schema.py | |
parent | 5f8005471d3affaaf23489df93a58ca64c3da3ca (diff) | |
download | tap-google-sheets-89643ba6fa98db82efd3246805ef801a8bfb5c81.tar.gz tap-google-sheets-89643ba6fa98db82efd3246805ef801a8bfb5c81.tar.zst tap-google-sheets-89643ba6fa98db82efd3246805ef801a8bfb5c81.zip |
Initial commit
Discovery mode works. Still working on normal sync.
Diffstat (limited to 'tap_google_sheets/schema.py')
-rw-r--r-- | tap_google_sheets/schema.py | 228 |
1 files changed, 228 insertions, 0 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py new file mode 100644 index 0000000..237ab06 --- /dev/null +++ b/tap_google_sheets/schema.py | |||
@@ -0,0 +1,228 @@ | |||
1 | import os | ||
2 | import json | ||
3 | from collections import OrderedDict | ||
4 | import singer | ||
5 | from singer import metadata | ||
6 | from tap_google_sheets.streams import STREAMS | ||
7 | |||
8 | LOGGER = singer.get_logger() | ||
9 | |||
10 | # Reference: | ||
11 | # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata | ||
12 | |||
# Convert a 1-based column index to its spreadsheet letter(s) (1 -> A, 27 -> AA)
def colnum_string(n):
    """Return the A1-notation column letters for 1-based column index ``n``."""
    letters = []
    while n > 0:
        n, remainder = divmod(n - 1, 26)
        letters.append(chr(ord('A') + remainder))
    return ''.join(reversed(letters))
20 | |||
21 | |||
# Create a sheet's JSON schema (and column metadata) from its first two rows
def get_sheet_schema_columns(sheet, spreadsheet_id, client):
    """Derive a JSON schema and column metadata list for a single sheet.

    Row 0 of the sheet data supplies the headers; row 1 supplies the first
    data row, whose effective values/formats determine each column's type.

    Args:
        sheet: Sheets API sheet object containing ``data[0].rowData``.
        spreadsheet_id: unused in this function; kept for signature compatibility.
        client: unused in this function; kept for signature compatibility.

    Returns:
        Tuple ``(sheet_json_schema, columns)`` where ``sheet_json_schema`` is a
        JSON-schema dict and ``columns`` is a list of per-column metadata dicts.

    Raises:
        Exception: on duplicate header names, or when a first-row value is an
            error or formula (invalid data types for discovery).
    """
    data = next(iter(sheet.get('data', [])), {})
    row_data = data.get('rowData', [])
    # spreadsheet is an OrderedDict, with ordered sheets and rows in the response

    # NOTE(review): sheets with fewer than 2 rows raise IndexError here —
    # confirm upstream guarantees a header row plus one data row.
    headers = row_data[0].get('values', [])
    first_values = row_data[1].get('values', [])

    # (Dead code removed: the schema dict was previously initialized as an
    # OrderedDict and given 'type'/'additionalProperties' keys, then
    # immediately rebuilt by this literal.)
    sheet_json_schema = {
        'type': 'object',
        'additionalProperties': False,
        'properties': {
            '__sdc_spreadsheet_id': {
                'type': ['null', 'string']
            },
            '__sdc_sheet_id': {
                'type': ['null', 'integer']
            },
            '__sdc_row': {
                'type': ['null', 'integer']
            }
        }
    }

    header_list = []  # used for checking uniqueness
    columns = []
    prior_header = None
    i = 0
    skipped = 0
    # Read column headers until end or 2 consecutive skipped headers
    for header in headers:
        column_index = i + 1
        column_letter = colnum_string(column_index)
        header_value = header.get('formattedValue')
        if header_value:  # NOT skipped
            column_is_skipped = False
            skipped = 0
            column_name = '{}'.format(header_value)
            if column_name in header_list:
                raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name))
            header_list.append(column_name)

            # Guard: the first data row may be shorter than the header row.
            first_value = first_values[i] if i < len(first_values) else {}

            column_effective_value = first_value.get('effectiveValue', {})
            # Default to string when the first data cell has no effective
            # value; previously the variable could be unbound (first column)
            # or stale from a prior column in that case.
            column_effective_value_type = 'stringValue'
            for key in column_effective_value.keys():
                if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'):
                    column_effective_value_type = key

            column_number_format = first_value.get('effectiveFormat', {}).get('numberFormat', {})
            column_number_format_type = column_number_format.get('type')

            # Determine datatype for sheet_json_schema
            #
            # column_effective_value_type = numberValue, stringValue, boolValue;
            #   INVALID: errorType, formulaType
            # Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
            #
            # column_number_format_type = UNSPECIFIED, TEXT, NUMBER, PERCENT,
            #   CURRENCY, DATE, TIME, DATE_TIME, SCIENTIFIC
            # Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
            #
            column_format = None  # Default
            if column_effective_value_type in ('errorType', 'formulaType'):
                # BUGFIX: this previously tested 'formulaValue'/'errorValue',
                # which the key scan above never produces, so the branch was
                # unreachable; the message also lacked its second format
                # argument and would have raised TypeError instead.
                raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(
                    column_name, column_effective_value))
            elif column_effective_value_type == 'stringValue':
                column_type = ['null', 'string']
                column_gs_type = 'stringValue'
            elif column_effective_value_type == 'boolValue':
                column_type = ['null', 'boolean', 'string']
                column_gs_type = 'boolValue'
            elif column_effective_value_type == 'numberValue':
                # Dates/times arrive as serial numbers; map the number format
                # to the matching JSON-schema string format.
                if column_number_format_type == 'DATE_TIME':
                    column_type = ['null', 'string']
                    column_format = 'date-time'
                    column_gs_type = 'numberType.DATE_TIME'
                elif column_number_format_type == 'DATE':
                    column_type = ['null', 'string']
                    column_format = 'date'
                    column_gs_type = 'numberType.DATE'
                elif column_number_format_type == 'TIME':
                    column_type = ['null', 'string']
                    column_format = 'time'
                    column_gs_type = 'numberType.TIME'
                elif column_number_format_type == 'TEXT':
                    column_type = ['null', 'string']
                    column_gs_type = 'stringValue'
                else:
                    column_type = ['null', 'number', 'string']
                    column_gs_type = 'numberType'

        else:  # skipped: blank header cell gets a placeholder column name
            column_is_skipped = True
            skipped = skipped + 1
            column_index_str = str(column_index).zfill(2)
            column_name = '__sdc_skip_col_{}'.format(column_index_str)
            column_type = ['null', 'string']
            column_format = None
            column_gs_type = 'stringValue'

        if skipped >= 2:
            # 2 consecutive skipped headers mark the end of the columns:
            # remove the prior placeholder column and stop scanning.
            # (Unused `column_count` assignment removed.)
            sheet_json_schema['properties'].pop(prior_header, None)
            break

        column = {
            'columnIndex': column_index,
            'columnLetter': column_letter,
            'columnName': column_name,
            'columnType': column_gs_type,
            'columnSkipped': column_is_skipped
        }
        columns.append(column)

        # The schema property intentionally shares the same dict object that
        # was appended to `columns`, so 'type'/'format' appear in both.
        sheet_json_schema['properties'][column_name] = column
        sheet_json_schema['properties'][column_name]['type'] = column_type
        if column_format:
            sheet_json_schema['properties'][column_name]['format'] = column_format

        prior_header = column_name
        i = i + 1

    return sheet_json_schema, columns
156 | |||
157 | |||
def get_sheet_metadata(sheet, spreadsheet_id, client):
    """Fetch one sheet's header/first-data rows and return its schema and columns.

    Builds the request path from the 'sheet_metadata' STREAMS entry, calls the
    Sheets API via ``client``, and delegates schema derivation to
    ``get_sheet_schema_columns``.
    """
    sheet_props = sheet.get('properties', {})
    sheet_id = sheet_props.get('sheetId')
    sheet_title = sheet_props.get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    stream_name = 'sheet_metadata'
    stream_metadata = STREAMS.get(stream_name)
    api = stream_metadata.get('api', 'sheets')
    params = stream_metadata.get('params', {})

    # Substitute the sheet title into the query parameters after joining them.
    query_parts = ['%s=%s' % (key, value) for (key, value) in params.items()]
    querystring = '&'.join(query_parts).replace('{sheet_title}', sheet_title)
    endpoint_path = stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id)
    path = '{}?{}'.format(endpoint_path, querystring)

    sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
    sheet_cols = sheet_md_results.get('sheets')[0]
    return get_sheet_schema_columns(sheet_cols, spreadsheet_id, client)
175 | |||
176 | |||
def get_abs_path(path):
    """Resolve *path* relative to the directory containing this module."""
    module_dir = os.path.dirname(os.path.realpath(__file__))
    return os.path.join(module_dir, path)
179 | |||
def get_schemas(client, spreadsheet_id):
    """Build discovery-mode schemas and field metadata.

    Loads the static JSON schema for every stream in STREAMS, and for the
    'spreadsheet_metadata' stream additionally discovers one dynamic schema
    per sheet in the spreadsheet.

    Returns:
        Tuple ``(schemas, field_metadata)`` keyed by stream name / sheet title.
    """
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        # Load the static schema shipped with the tap.
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None)
        )
        field_metadata[stream_name] = mdata

        if stream_name != 'spreadsheet_metadata':
            continue

        # Discover a dynamic schema for each sheet in the spreadsheet.
        api = stream_metadata.get('api', 'sheets')
        params = stream_metadata.get('params', {})
        querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
        endpoint_path = stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id)
        path = '{}?{}'.format(endpoint_path, querystring)

        # NOTE(review): the querystring is already embedded in `path` and is
        # also passed as `params=` — looks redundant; confirm against the
        # client.get signature before changing.
        spreadsheet_md_results = client.get(path=path, params=querystring, api=api, endpoint=stream_name)

        for sheet in spreadsheet_md_results.get('sheets') or []:
            sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)

            sheet_title = sheet.get('properties', {}).get('title')
            schemas[sheet_title] = sheet_schema
            sheet_mdata = metadata.new()
            sheet_mdata = metadata.get_standard_metadata(
                schema=sheet_schema,
                key_properties=['__sdc_row'],
                valid_replication_keys=None,
                replication_method='FULL_TABLE'
            )
            field_metadata[sheet_title] = sheet_mdata

    return schemas, field_metadata