From 376f1145837541d4fff2ad0e499236761f9873c3 Mon Sep 17 00:00:00 2001 From: Jeff Huth <39202799+jeffhuth-bytecode@users.noreply.github.com> Date: Mon, 24 Feb 2020 09:53:26 -0800 Subject: v.0.0.4 Logic to skip empty sheets (#4) * v.0.0.2 schema and sync changes Change number json schema to anyOf with multipleOf; skip empty rows; move write_bookmark to end of sync.py * v.0.0.3 Sync activate version and error handling Update README.md documentation. Improved logging and handling of errors and warnings. Better null handling in Discovery and Sync. Fix issues with activate version messages. * v.0.0.4 Skip empty worksheets Add logic to skip empty worksheets in Discovery and Sync mode. * schema.py fix number datatype issue Nomber datatypes are being created as strings in targets. The JSON schema order needs to be adjusted so that order is null, number, string. --- tap_google_sheets/schema.py | 365 ++++++++++++++++++++++---------------------- 1 file changed, 185 insertions(+), 180 deletions(-) (limited to 'tap_google_sheets/schema.py') diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index e319c03..c229d72 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py @@ -25,184 +25,188 @@ def get_sheet_schema_columns(sheet): sheet_json_schema = OrderedDict() data = next(iter(sheet.get('data', [])), {}) row_data = data.get('rowData', []) - # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse - - headers = row_data[0].get('values', []) - first_values = row_data[1].get('values', []) - # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) - - sheet_json_schema = { - 'type': 'object', - 'additionalProperties': False, - 'properties': { - '__sdc_spreadsheet_id': { - 'type': ['null', 'string'] - }, - '__sdc_sheet_id': { - 'type': ['null', 'integer'] - }, - '__sdc_row': { - 'type': ['null', 'integer'] + if row_data == []: + # Empty sheet, SKIP + LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title)) + return None, None + else: + # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse + headers = row_data[0].get('values', []) + first_values = row_data[1].get('values', []) + # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) + + sheet_json_schema = { + 'type': 'object', + 'additionalProperties': False, + 'properties': { + '__sdc_spreadsheet_id': { + 'type': ['null', 'string'] + }, + '__sdc_sheet_id': { + 'type': ['null', 'integer'] + }, + '__sdc_row': { + 'type': ['null', 'integer'] + } } } - } - - header_list = [] # used for checking uniqueness - columns = [] - prior_header = None - i = 0 - skipped = 0 - # Read column headers until end or 2 consecutive skipped headers - for header in headers: - # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True))) - column_index = i + 1 - column_letter = colnum_string(column_index) - header_value = header.get('formattedValue') - if header_value: # NOT skipped - column_is_skipped = False - skipped = 0 - column_name = '{}'.format(header_value) - if column_name in header_list: - raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format( - sheet_title, column_name, column_letter)) - header_list.append(column_name) - - first_value = None - try: - first_value = first_values[i] - except IndexError as err: - raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format( - sheet_title, column_name, column_letter, err)) - - column_effective_value = first_value.get('effectiveValue', {}) - - col_val = None - if column_effective_value == {}: - column_effective_value_type = 'stringValue' - LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format( - sheet_title, column_name, column_letter)) - LOGGER.info(' Setting column datatype to STRING') - else: - for key, val in column_effective_value.items(): - if key in ('numberValue', 'stringValue', 'boolValue'): - column_effective_value_type = key - col_val = str(val) - elif key in ('errorType', 'formulaType'): - col_val = str(val) - raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( - sheet_title, column_name, column_letter, key, col_val)) - - column_number_format = first_values[i].get('effectiveFormat', {}).get( - 'numberFormat', {}) - column_number_format_type = column_number_format.get('type') - - # Determine datatype for sheet_json_schema - # - # column_effective_value_type = numberValue, stringValue, boolValue; - # INVALID: errorType, formulaType - # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue - # - # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, - # TIME, DATE_TIME, SCIENTIFIC - # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType - # - column_format = None # Default - if column_effective_value == {}: - col_properties = {'type': ['null', 'string']} - column_gs_type = 'stringValue' - LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format( + + header_list = [] # used for checking uniqueness + columns = [] + prior_header = None + i = 0 + skipped = 0 + # Read column headers until end or 2 consecutive skipped headers + for header in headers: + # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True))) + column_index = i + 1 + column_letter = colnum_string(column_index) + header_value = header.get('formattedValue') + if header_value: # NOT skipped + column_is_skipped = False + skipped = 0 + column_name = '{}'.format(header_value) + if column_name in header_list: + raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format( sheet_title, column_name, column_letter)) - LOGGER.info(' Setting column datatype to STRING') - elif column_effective_value_type == 'stringValue': - col_properties = {'type': ['null', 'string']} - column_gs_type = 'stringValue' - elif column_effective_value_type == 'boolValue': - col_properties = {'type': ['null', 'boolean', 'string']} - column_gs_type = 'boolValue' - elif column_effective_value_type == 'numberValue': - if column_number_format_type == 'DATE_TIME': - col_properties = { - 'type': ['null', 'string'], - 'format': 'date-time' - } - column_gs_type = 'numberType.DATE_TIME' - elif column_number_format_type == 'DATE': - col_properties = { - 'type': ['null', 'string'], - 'format': 'date' - } - column_gs_type = 'numberType.DATE' - elif column_number_format_type == 'TIME': - col_properties = { - 'type': ['null', 'string'], - 'format': 'time' - } - column_gs_type = 'numberType.TIME' - elif column_number_format_type == 'TEXT': + header_list.append(column_name) + + first_value = None + try: + first_value = first_values[i] + except IndexError as err: + raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format( + sheet_title, column_name, column_letter, err)) + + column_effective_value = first_value.get('effectiveValue', {}) + + col_val = None + if column_effective_value == {}: + column_effective_value_type = 'stringValue' + LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format( + sheet_title, column_name, column_letter)) + LOGGER.info(' Setting column datatype to STRING') + else: + for key, val in column_effective_value.items(): + if key in ('numberValue', 'stringValue', 'boolValue'): + column_effective_value_type = key + col_val = str(val) + elif key in ('errorType', 'formulaType'): + col_val = str(val) + raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( + sheet_title, column_name, column_letter, key, col_val)) + + column_number_format = first_values[i].get('effectiveFormat', {}).get( + 'numberFormat', {}) + column_number_format_type = column_number_format.get('type') + + # Determine datatype for sheet_json_schema + # + # column_effective_value_type = numberValue, stringValue, boolValue; + # INVALID: errorType, formulaType + # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue + # + # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, + # TIME, DATE_TIME, SCIENTIFIC + # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType + # + column_format = None # Default + if column_effective_value == {}: + col_properties = {'type': ['null', 'string']} + column_gs_type = 'stringValue' + LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format( + sheet_title, column_name, column_letter)) + LOGGER.info(' Setting column datatype to STRING') + elif column_effective_value_type == 'stringValue': col_properties = {'type': ['null', 'string']} column_gs_type = 'stringValue' + elif column_effective_value_type == 'boolValue': + col_properties = {'type': ['null', 'boolean', 'string']} + column_gs_type = 'boolValue' + elif column_effective_value_type == 'numberValue': + if column_number_format_type == 'DATE_TIME': + col_properties = { + 'type': ['null', 'string'], + 'format': 'date-time' + } + column_gs_type = 'numberType.DATE_TIME' + elif column_number_format_type == 'DATE': + col_properties = { + 'type': ['null', 'string'], + 'format': 'date' + } + column_gs_type = 'numberType.DATE' + elif column_number_format_type == 'TIME': + col_properties = { + 'type': ['null', 'string'], + 'format': 'time' + } + column_gs_type = 'numberType.TIME' + elif column_number_format_type == 'TEXT': + col_properties = {'type': ['null', 'string']} + column_gs_type = 'stringValue' + else: + # Interesting - order in the anyOf makes a difference. + # Number w/ multipleOf must be listed last, otherwise errors occur. + col_properties = { + 'anyOf': [ + { + 'type': 'null' + }, + { + 'type': 'number', + 'multipleOf': 1e-15 + }, + { + 'type': 'string' + } + ] + } + column_gs_type = 'numberType' + # Catch-all to deal with other types and set to string + # column_effective_value_type: formulaValue, errorValue, or other else: - # Interesting - order in the anyOf makes a difference. - # Number w/ multipleOf must be listed last, otherwise errors occur. - col_properties = { - 'anyOf': [ - { - 'type': 'string' - }, - { - 'type': 'null' - }, - { - 'type': 'number', - 'multipleOf': 1e-15 - } - ] - } - column_gs_type = 'numberType' - # Catch-all to deal with other types and set to string - # column_effective_value_type: formulaValue, errorValue, or other - else: + col_properties = {'type': ['null', 'string']} + column_gs_type = 'unsupportedValue' + LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( + sheet_title, column_name, column_letter, column_effective_value_type, col_val)) + LOGGER.info('Converting to string.') + else: # skipped + column_is_skipped = True + skipped = skipped + 1 + column_index_str = str(column_index).zfill(2) + column_name = '__sdc_skip_col_{}'.format(column_index_str) col_properties = {'type': ['null', 'string']} - column_gs_type = 'unsupportedValue' - LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( - sheet_title, column_name, column_letter, column_effective_value_type, col_val)) - LOGGER.info('Converting to string.') - else: # skipped - column_is_skipped = True - skipped = skipped + 1 - column_index_str = str(column_index).zfill(2) - column_name = '__sdc_skip_col_{}'.format(column_index_str) - col_properties = {'type': ['null', 'string']} - column_gs_type = 'stringValue' - LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format( - sheet_title, column_name, column_letter)) - LOGGER.info(' This column will be skipped during data loading.') - - if skipped >= 2: - # skipped = 2 consecutive skipped headers - # Remove prior_header column_name - sheet_json_schema['properties'].pop(prior_header, None) - LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format( - sheet_title, column_name, column_letter)) - break - - else: - column = {} - column = { - 'columnIndex': column_index, - 'columnLetter': column_letter, - 'columnName': column_name, - 'columnType': column_gs_type, - 'columnSkipped': column_is_skipped - } - columns.append(column) + column_gs_type = 'stringValue' + LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format( + sheet_title, column_name, column_letter)) + LOGGER.info(' This column will be skipped during data loading.') - sheet_json_schema['properties'][column_name] = col_properties + if skipped >= 2: + # skipped = 2 consecutive skipped headers + # Remove prior_header column_name + sheet_json_schema['properties'].pop(prior_header, None) + LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format( + sheet_title, column_name, column_letter)) + break - prior_header = column_name - i = i + 1 + else: + column = {} + column = { + 'columnIndex': column_index, + 'columnLetter': column_letter, + 'columnName': column_name, + 'columnType': column_gs_type, + 'columnSkipped': column_is_skipped + } + columns.append(column) - return sheet_json_schema, columns + sheet_json_schema['properties'][column_name] = col_properties + + prior_header = column_name + i = i + 1 + + return sheet_json_schema, columns # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query @@ -276,17 +280,18 @@ def get_schemas(client, spreadsheet_id): for sheet in sheets: # GET sheet_json_schema for each worksheet (from function above) sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) - # LOGGER.info('columns = {}'.format(columns)) - - sheet_title = sheet.get('properties', {}).get('title') - schemas[sheet_title] = sheet_json_schema - sheet_mdata = metadata.new() - sheet_mdata = metadata.get_standard_metadata( - schema=sheet_json_schema, - key_properties=['__sdc_row'], - valid_replication_keys=None, - replication_method='FULL_TABLE' - ) - field_metadata[sheet_title] = sheet_mdata + + # SKIP empty sheets (where sheet_json_schema and columns are None) + if sheet_json_schema and columns: + sheet_title = sheet.get('properties', {}).get('title') + schemas[sheet_title] = sheet_json_schema + sheet_mdata = metadata.new() + sheet_mdata = metadata.get_standard_metadata( + schema=sheet_json_schema, + key_properties=['__sdc_row'], + valid_replication_keys=None, + replication_method='FULL_TABLE' + ) + field_metadata[sheet_title] = sheet_mdata return schemas, field_metadata -- cgit v1.2.3