From f6d7c509f00feb63810df7a9216fe1769195197a Mon Sep 17 00:00:00 2001 From: cosimon Date: Fri, 17 Apr 2020 09:09:41 -0400 Subject: Skip malformed sheets (#13) * Skip malformed sheets * Move the try except block up a level for readability sake * Remove debug code --- tap_google_sheets/schema.py | 339 ++++++++++++++++++++++---------------------- 1 file changed, 171 insertions(+), 168 deletions(-) (limited to 'tap_google_sheets') diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index c229d72..3e63e75 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py @@ -29,184 +29,183 @@ def get_sheet_schema_columns(sheet): # Empty sheet, SKIP LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title)) return None, None - else: - # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse - headers = row_data[0].get('values', []) - first_values = row_data[1].get('values', []) - # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) - - sheet_json_schema = { - 'type': 'object', - 'additionalProperties': False, - 'properties': { - '__sdc_spreadsheet_id': { - 'type': ['null', 'string'] - }, - '__sdc_sheet_id': { - 'type': ['null', 'integer'] - }, - '__sdc_row': { - 'type': ['null', 'integer'] - } + # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse + headers = row_data[0].get('values', []) + first_values = row_data[1].get('values', []) + # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) + + sheet_json_schema = { + 'type': 'object', + 'additionalProperties': False, + 'properties': { + '__sdc_spreadsheet_id': { + 'type': ['null', 'string'] + }, + '__sdc_sheet_id': { + 'type': ['null', 'integer'] + }, + '__sdc_row': { + 'type': ['null', 'integer'] } } + } + + header_list = [] # used for checking uniqueness + columns = [] + prior_header = None + i = 0 + skipped = 0 + # Read column headers until end or 2 consecutive skipped headers + for header in headers: + # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True))) + column_index = i + 1 + column_letter = colnum_string(column_index) + header_value = header.get('formattedValue') + if header_value: # NOT skipped + column_is_skipped = False + skipped = 0 + column_name = '{}'.format(header_value) + if column_name in header_list: + raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format( + sheet_title, column_name, column_letter)) + header_list.append(column_name) - header_list = [] # used for checking uniqueness - columns = [] - prior_header = None - i = 0 - skipped = 0 - # Read column headers until end or 2 consecutive skipped headers - for header in headers: - # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True))) - column_index = i + 1 - column_letter = colnum_string(column_index) - header_value = header.get('formattedValue') - if header_value: # NOT skipped - column_is_skipped = False - skipped = 0 - column_name = '{}'.format(header_value) - if column_name in header_list: - raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format( - sheet_title, column_name, column_letter)) - header_list.append(column_name) - - first_value = None - try: - first_value = first_values[i] - except IndexError as err: - raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format( - sheet_title, column_name, column_letter, err)) - - column_effective_value = first_value.get('effectiveValue', {}) - - col_val = None - if column_effective_value == {}: - column_effective_value_type = 'stringValue' - LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format( + first_value = None + try: + first_value = first_values[i] + except IndexError as err: + raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format( + sheet_title, column_name, column_letter, err)) + + column_effective_value = first_value.get('effectiveValue', {}) + + col_val = None + if column_effective_value == {}: + column_effective_value_type = 'stringValue' + LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format( + sheet_title, column_name, column_letter)) + LOGGER.info(' Setting column datatype to STRING') + else: + for key, val in column_effective_value.items(): + if key in ('numberValue', 'stringValue', 'boolValue'): + column_effective_value_type = key + col_val = str(val) + elif key in ('errorType', 'formulaType'): + col_val = str(val) + raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( + sheet_title, column_name, column_letter, key, col_val)) + + column_number_format = first_values[i].get('effectiveFormat', {}).get( + 'numberFormat', {}) + column_number_format_type = column_number_format.get('type') + + # Determine datatype for sheet_json_schema + # + # column_effective_value_type = numberValue, stringValue, boolValue; + # INVALID: errorType, formulaType + # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue + # + # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, + # TIME, DATE_TIME, SCIENTIFIC + # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType + # + column_format = None # Default + if column_effective_value == {}: + col_properties = {'type': ['null', 'string']} + column_gs_type = 'stringValue' + LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format( sheet_title, column_name, column_letter)) - LOGGER.info(' Setting column datatype to STRING') - else: - for key, val in column_effective_value.items(): - if key in ('numberValue', 'stringValue', 'boolValue'): - column_effective_value_type = key - col_val = str(val) - elif key in ('errorType', 'formulaType'): - col_val = str(val) - raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( - sheet_title, column_name, column_letter, key, col_val)) - - column_number_format = first_values[i].get('effectiveFormat', {}).get( - 'numberFormat', {}) - column_number_format_type = column_number_format.get('type') - - # Determine datatype for sheet_json_schema - # - # column_effective_value_type = numberValue, stringValue, boolValue; - # INVALID: errorType, formulaType - # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue - # - # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, - # TIME, DATE_TIME, SCIENTIFIC - # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType - # - column_format = None # Default - if column_effective_value == {}: - col_properties = {'type': ['null', 'string']} - column_gs_type = 'stringValue' - LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format( - sheet_title, column_name, column_letter)) - LOGGER.info(' Setting column datatype to STRING') - elif column_effective_value_type == 'stringValue': + LOGGER.info(' Setting column datatype to STRING') + elif column_effective_value_type == 'stringValue': + col_properties = {'type': ['null', 'string']} + column_gs_type = 'stringValue' + elif column_effective_value_type == 'boolValue': + col_properties = {'type': ['null', 'boolean', 'string']} + column_gs_type = 'boolValue' + elif column_effective_value_type == 'numberValue': + if column_number_format_type == 'DATE_TIME': + col_properties = { + 'type': ['null', 'string'], + 'format': 'date-time' + } + column_gs_type = 'numberType.DATE_TIME' + elif column_number_format_type == 'DATE': + col_properties = { + 'type': ['null', 'string'], + 'format': 'date' + } + column_gs_type = 'numberType.DATE' + elif column_number_format_type == 'TIME': + col_properties = { + 'type': ['null', 'string'], + 'format': 'time' + } + column_gs_type = 'numberType.TIME' + elif column_number_format_type == 'TEXT': col_properties = {'type': ['null', 'string']} column_gs_type = 'stringValue' - elif column_effective_value_type == 'boolValue': - col_properties = {'type': ['null', 'boolean', 'string']} - column_gs_type = 'boolValue' - elif column_effective_value_type == 'numberValue': - if column_number_format_type == 'DATE_TIME': - col_properties = { - 'type': ['null', 'string'], - 'format': 'date-time' - } - column_gs_type = 'numberType.DATE_TIME' - elif column_number_format_type == 'DATE': - col_properties = { - 'type': ['null', 'string'], - 'format': 'date' - } - column_gs_type = 'numberType.DATE' - elif column_number_format_type == 'TIME': - col_properties = { - 'type': ['null', 'string'], - 'format': 'time' - } - column_gs_type = 'numberType.TIME' - elif column_number_format_type == 'TEXT': - col_properties = {'type': ['null', 'string']} - column_gs_type = 'stringValue' - else: - # Interesting - order in the anyOf makes a difference. - # Number w/ multipleOf must be listed last, otherwise errors occur. - col_properties = { - 'anyOf': [ - { - 'type': 'null' - }, - { - 'type': 'number', - 'multipleOf': 1e-15 - }, - { - 'type': 'string' - } - ] - } - column_gs_type = 'numberType' - # Catch-all to deal with other types and set to string - # column_effective_value_type: formulaValue, errorValue, or other else: - col_properties = {'type': ['null', 'string']} - column_gs_type = 'unsupportedValue' - LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( - sheet_title, column_name, column_letter, column_effective_value_type, col_val)) - LOGGER.info('Converting to string.') - else: # skipped - column_is_skipped = True - skipped = skipped + 1 - column_index_str = str(column_index).zfill(2) - column_name = '__sdc_skip_col_{}'.format(column_index_str) - col_properties = {'type': ['null', 'string']} - column_gs_type = 'stringValue' - LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format( - sheet_title, column_name, column_letter)) - LOGGER.info(' This column will be skipped during data loading.') - - if skipped >= 2: - # skipped = 2 consecutive skipped headers - # Remove prior_header column_name - sheet_json_schema['properties'].pop(prior_header, None) - LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format( - sheet_title, column_name, column_letter)) - break - + # Interesting - order in the anyOf makes a difference. + # Number w/ multipleOf must be listed last, otherwise errors occur. + col_properties = { + 'anyOf': [ + { + 'type': 'null' + }, + { + 'type': 'number', + 'multipleOf': 1e-15 + }, + { + 'type': 'string' + } + ] + } + column_gs_type = 'numberType' + # Catch-all to deal with other types and set to string + # column_effective_value_type: formulaValue, errorValue, or other else: - column = {} - column = { - 'columnIndex': column_index, - 'columnLetter': column_letter, - 'columnName': column_name, - 'columnType': column_gs_type, - 'columnSkipped': column_is_skipped - } - columns.append(column) + col_properties = {'type': ['null', 'string']} + column_gs_type = 'unsupportedValue' + LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( + sheet_title, column_name, column_letter, column_effective_value_type, col_val)) + LOGGER.info('Converting to string.') + else: # skipped + column_is_skipped = True + skipped = skipped + 1 + column_index_str = str(column_index).zfill(2) + column_name = '__sdc_skip_col_{}'.format(column_index_str) + col_properties = {'type': ['null', 'string']} + column_gs_type = 'stringValue' + LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format( + sheet_title, column_name, column_letter)) + LOGGER.info(' This column will be skipped during data loading.') + + if skipped >= 2: + # skipped = 2 consecutive skipped headers + # Remove prior_header column_name + sheet_json_schema['properties'].pop(prior_header, None) + LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format( + sheet_title, column_name, column_letter)) + break + + else: + column = {} + column = { + 'columnIndex': column_index, + 'columnLetter': column_letter, + 'columnName': column_name, + 'columnType': column_gs_type, + 'columnSkipped': column_is_skipped + } + columns.append(column) - sheet_json_schema['properties'][column_name] = col_properties + sheet_json_schema['properties'][column_name] = col_properties - prior_header = column_name - i = i + 1 + prior_header = column_name + i = i + 1 - return sheet_json_schema, columns + return sheet_json_schema, columns # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query @@ -232,7 +231,11 @@ def get_sheet_metadata(sheet, spreadsheet_id, client): sheet_metadata = sheet_md_results.get('sheets')[0] # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results) - sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata) + try: + sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata) + except: + LOGGER.info('SKIPPING Malformed sheet: {}'.format(sheet_title)) + sheet_json_schema, columns = None, None return sheet_json_schema, columns -- cgit v1.2.3