v.0.0.3 Sync error handling, activate version, documentation (#2)v0.0.3

* v.0.0.2 schema and sync changes Change number json schema to anyOf with multipleOf; skip empty rows; move write_bookmark to end of sync.py * v.0.0.3 Sync activate version and error handling Update README.md documentation. Improved logging and handling of errors and warnings. Better null handling in Discovery and Sync. Fix issues with activate version messages.
author: Jeff Huth <39202799+jeffhuth-bytecode@users.noreply.github.com> 2020-01-09 07:30:53 -0800
committer: Kyle Allan <KAllan357@gmail.com> 2020-01-09 10:30:53 -0500
commit: 43a24cbab1dbc35b893c35b86e34adc0f2fb84e7 (patch)
tree: bcbaae860aad0a94bcc4d27f4804504691401438 /tap_google_sheets/schema.py
parent: 5890b89c1aa7c554235b3cef156b5a5a2c594bec (diff)
download: tap-google-sheets-e746354ec6a62e29923d1a18272b88a57527f172.tar.gz
tap-google-sheets-e746354ec6a62e29923d1a18272b88a57527f172.tar.zst
tap-google-sheets-e746354ec6a62e29923d1a18272b88a57527f172.zip
1 files changed, 41 insertions, 10 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py
index 243467b..e319c03 100644
--- a/tap_google_sheets/schema.py
+++ b/tap_google_sheets/schema.py
@@ -21,6 +21,7 @@ def colnum_string(num):
 # Create sheet_metadata_json with columns from sheet
 def get_sheet_schema_columns(sheet):
+    sheet_title = sheet.get('properties', {}).get('title')
    sheet_json_schema = OrderedDict()
    data = next(iter(sheet.get('data', [])), {})
    row_data = data.get('rowData', [])
@@ -62,15 +63,34 @@ def get_sheet_schema_columns(sheet):
            skipped = 0
            column_name = '{}'.format(header_value)
            if column_name in header_list:
-                raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name))
+                raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
+                    sheet_title, column_name, column_letter))
            header_list.append(column_name)
-            first_value = first_values[i]
+            first_value = None
+            try:
+                first_value = first_values[i]
+            except IndexError as err:
+                raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
+                    sheet_title, column_name, column_letter, err))
+            
            column_effective_value = first_value.get('effectiveValue', {})
-            for key in column_effective_value.keys():
-                if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'):
+            col_val = None
-                    column_effective_value_type = key
+            if column_effective_value == {}:
+                column_effective_value_type = 'stringValue'
+                LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
+                    sheet_title, column_name, column_letter))
+                LOGGER.info('   Setting column datatype to STRING')
+            else:
+                for key, val in column_effective_value.items():
+                    if key in ('numberValue', 'stringValue', 'boolValue'):
+                        column_effective_value_type = key
+                        col_val = str(val)
+                    elif key in ('errorType', 'formulaType'):
+                        col_val = str(val)
+                        raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
+                            sheet_title, column_name, column_letter, key, col_val))
            column_number_format = first_values[i].get('effectiveFormat', {}).get(
                'numberFormat', {})
@@ -87,7 +107,13 @@ def get_sheet_schema_columns(sheet):
            #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
            #
            column_format = None # Default
-            if column_effective_value_type == 'stringValue':
+            if column_effective_value == {}:
+                col_properties = {'type': ['null', 'string']}
+                column_gs_type = 'stringValue'
+                LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
+                        sheet_title, column_name, column_letter))
+                LOGGER.info('   Setting column datatype to STRING')
+            elif column_effective_value_type == 'stringValue':
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'stringValue'
            elif column_effective_value_type == 'boolValue':
@@ -138,8 +164,8 @@ def get_sheet_schema_columns(sheet):
            else:
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'unsupportedValue'
-                LOGGER.info('Unsupported data type: {}, value: {}'.format(column_name, \
+                LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
-                    column_effective_value_type))
+                        sheet_title, column_name, column_letter, column_effective_value_type, col_val))
                LOGGER.info('Converting to string.')
        else: # skipped
            column_is_skipped = True
@@ -148,11 +174,16 @@ def get_sheet_schema_columns(sheet):
            column_name = '__sdc_skip_col_{}'.format(column_index_str)
            col_properties = {'type': ['null', 'string']}
            column_gs_type = 'stringValue'
+            LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
+                sheet_title, column_name, column_letter))
+            LOGGER.info('  This column will be skipped during data loading.')
        if skipped >= 2:
            # skipped = 2 consecutive skipped headers
            # Remove prior_header column_name
            sheet_json_schema['properties'].pop(prior_header, None)
+            LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
+                sheet_title, column_name, column_letter))
            break
        else:
@@ -245,7 +276,7 @@ def get_schemas(client, spreadsheet_id):
                for sheet in sheets:
                    # GET sheet_json_schema for each worksheet (from function above)
                    sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
-                    LOGGER.info('columns = {}'.format(columns))
+                    # LOGGER.info('columns = {}'.format(columns))
                    sheet_title = sheet.get('properties', {}).get('title')
                    schemas[sheet_title] = sheet_json_schema
author	Jeff Huth <39202799+jeffhuth-bytecode@users.noreply.github.com>	2020-01-09 07:30:53 -0800
committer	Kyle Allan <KAllan357@gmail.com>	2020-01-09 10:30:53 -0500
commit	43a24cbab1dbc35b893c35b86e34adc0f2fb84e7 (patch)
tree	bcbaae860aad0a94bcc4d27f4804504691401438 /tap_google_sheets/schema.py
parent	5890b89c1aa7c554235b3cef156b5a5a2c594bec (diff)
download	tap-google-sheets-e746354ec6a62e29923d1a18272b88a57527f172.tar.gz tap-google-sheets-e746354ec6a62e29923d1a18272b88a57527f172.tar.zst tap-google-sheets-e746354ec6a62e29923d1a18272b88a57527f172.zip

diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index 243467b..e319c03 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py
@@ -21,6 +21,7 @@ def colnum_string(num):
21		21
22	# Create sheet_metadata_json with columns from sheet	22	# Create sheet_metadata_json with columns from sheet
23	def get_sheet_schema_columns(sheet):	23	def get_sheet_schema_columns(sheet):
		24	sheet_title = sheet.get('properties', {}).get('title')
24	sheet_json_schema = OrderedDict()	25	sheet_json_schema = OrderedDict()
25	data = next(iter(sheet.get('data', [])), {})	26	data = next(iter(sheet.get('data', [])), {})
26	row_data = data.get('rowData', [])	27	row_data = data.get('rowData', [])
@@ -62,15 +63,34 @@ def get_sheet_schema_columns(sheet):
62	skipped = 0	63	skipped = 0
63	column_name = '{}'.format(header_value)	64	column_name = '{}'.format(header_value)
64	if column_name in header_list:	65	if column_name in header_list:
65	raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name))	66	raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
		67	sheet_title, column_name, column_letter))
66	header_list.append(column_name)	68	header_list.append(column_name)
67		69
68	first_value = first_values[i]	70	first_value = None
69		71	try:
		72	first_value = first_values[i]
		73	except IndexError as err:
		74	raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
		75	sheet_title, column_name, column_letter, err))
		76
70	column_effective_value = first_value.get('effectiveValue', {})	77	column_effective_value = first_value.get('effectiveValue', {})
71	for key in column_effective_value.keys():	78
72	if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'):	79	col_val = None
73	column_effective_value_type = key	80	if column_effective_value == {}:
		81	column_effective_value_type = 'stringValue'
		82	LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
		83	sheet_title, column_name, column_letter))
		84	LOGGER.info(' Setting column datatype to STRING')
		85	else:
		86	for key, val in column_effective_value.items():
		87	if key in ('numberValue', 'stringValue', 'boolValue'):
		88	column_effective_value_type = key
		89	col_val = str(val)
		90	elif key in ('errorType', 'formulaType'):
		91	col_val = str(val)
		92	raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
		93	sheet_title, column_name, column_letter, key, col_val))
74		94
75	column_number_format = first_values[i].get('effectiveFormat', {}).get(	95	column_number_format = first_values[i].get('effectiveFormat', {}).get(
76	'numberFormat', {})	96	'numberFormat', {})
@@ -87,7 +107,13 @@ def get_sheet_schema_columns(sheet):
87	# https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType	107	# https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
88	#	108	#
89	column_format = None # Default	109	column_format = None # Default
90	if column_effective_value_type == 'stringValue':	110	if column_effective_value == {}:
		111	col_properties = {'type': ['null', 'string']}
		112	column_gs_type = 'stringValue'
		113	LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
		114	sheet_title, column_name, column_letter))
		115	LOGGER.info(' Setting column datatype to STRING')
		116	elif column_effective_value_type == 'stringValue':
91	col_properties = {'type': ['null', 'string']}	117	col_properties = {'type': ['null', 'string']}
92	column_gs_type = 'stringValue'	118	column_gs_type = 'stringValue'
93	elif column_effective_value_type == 'boolValue':	119	elif column_effective_value_type == 'boolValue':
@@ -138,8 +164,8 @@ def get_sheet_schema_columns(sheet):
138	else:	164	else:
139	col_properties = {'type': ['null', 'string']}	165	col_properties = {'type': ['null', 'string']}
140	column_gs_type = 'unsupportedValue'	166	column_gs_type = 'unsupportedValue'
141	LOGGER.info('Unsupported data type: {}, value: {}'.format(column_name, \	167	LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
142	column_effective_value_type))	168	sheet_title, column_name, column_letter, column_effective_value_type, col_val))
143	LOGGER.info('Converting to string.')	169	LOGGER.info('Converting to string.')
144	else: # skipped	170	else: # skipped
145	column_is_skipped = True	171	column_is_skipped = True
@@ -148,11 +174,16 @@ def get_sheet_schema_columns(sheet):
148	column_name = '__sdc_skip_col_{}'.format(column_index_str)	174	column_name = '__sdc_skip_col_{}'.format(column_index_str)
149	col_properties = {'type': ['null', 'string']}	175	col_properties = {'type': ['null', 'string']}
150	column_gs_type = 'stringValue'	176	column_gs_type = 'stringValue'
		177	LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
		178	sheet_title, column_name, column_letter))
		179	LOGGER.info(' This column will be skipped during data loading.')
151		180
152	if skipped >= 2:	181	if skipped >= 2:
153	# skipped = 2 consecutive skipped headers	182	# skipped = 2 consecutive skipped headers
154	# Remove prior_header column_name	183	# Remove prior_header column_name
155	sheet_json_schema['properties'].pop(prior_header, None)	184	sheet_json_schema['properties'].pop(prior_header, None)
		185	LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
		186	sheet_title, column_name, column_letter))
156	break	187	break
157		188
158	else:	189	else:
@@ -245,7 +276,7 @@ def get_schemas(client, spreadsheet_id):
245	for sheet in sheets:	276	for sheet in sheets:
246	# GET sheet_json_schema for each worksheet (from function above)	277	# GET sheet_json_schema for each worksheet (from function above)
247	sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)	278	sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
248	LOGGER.info('columns = {}'.format(columns))	279	# LOGGER.info('columns = {}'.format(columns))
249		280
250	sheet_title = sheet.get('properties', {}).get('title')	281	sheet_title = sheet.get('properties', {}).get('title')
251	schemas[sheet_title] = sheet_json_schema	282	schemas[sheet_title] = sheet_json_schema