]> git.immae.eu Git - github/fretlink/tap-google-sheets.git/blobdiff - tap_google_sheets/schema.py
Skip malformed sheets (#13)
[github/fretlink/tap-google-sheets.git] / tap_google_sheets / schema.py
index 237ab06f9cea44deaa3225a75d286e8852615652..3e63e7597409859e323eab11dfdb0d1ee3b58a45 100644 (file)
@@ -11,27 +11,29 @@ LOGGER = singer.get_logger()
 # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata
 
 # Convert column index to column letter
-def colnum_string(n):
+def colnum_string(num):
     string = ""
-    while n > 0:
-        n, remainder = divmod(n - 1, 26)
+    while num > 0:
+        num, remainder = divmod(num - 1, 26)
         string = chr(65 + remainder) + string
     return string
 
 
 # Create sheet_metadata_json with columns from sheet
-def get_sheet_schema_columns(sheet, spreadsheet_id, client):
+def get_sheet_schema_columns(sheet):
+    sheet_title = sheet.get('properties', {}).get('title')
     sheet_json_schema = OrderedDict()
     data = next(iter(sheet.get('data', [])), {})
-    row_data = data.get('rowData',[])
+    row_data = data.get('rowData', [])
+    if row_data == []:
+        # Empty sheet, SKIP
+        LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
+        return None, None
     # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
-
     headers = row_data[0].get('values', [])
     first_values = row_data[1].get('values', [])
     # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
 
-    sheet_json_schema['type'] = 'object'
-    sheet_json_schema['additionalProperties'] = False
     sheet_json_schema = {
         'type': 'object',
         'additionalProperties': False,
@@ -64,73 +66,127 @@ def get_sheet_schema_columns(sheet, spreadsheet_id, client):
             skipped = 0
             column_name = '{}'.format(header_value)
             if column_name in header_list:
-                raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name))
-            else:
-                header_list.append(column_name)
+                raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
+                    sheet_title, column_name, column_letter))
+            header_list.append(column_name)
 
-            first_value = first_values[i]
-            # LOGGER.info('first_value[{}] = {}'.format(i, json.dumps(first_value, indent=2, sort_keys=True)))
+            first_value = None
+            try:
+                first_value = first_values[i]
+            except IndexError as err:
+                raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
+                    sheet_title, column_name, column_letter, err))
 
             column_effective_value = first_value.get('effectiveValue', {})
-            for key in column_effective_value.keys():
-                if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'):
-                    column_effective_value_type = key
 
-            column_number_format = first_values[i].get('effectiveFormat', {}).get('numberFormat', {})
+            col_val = None
+            if column_effective_value == {}:
+                column_effective_value_type = 'stringValue'
+                LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
+                    sheet_title, column_name, column_letter))
+                LOGGER.info('   Setting column datatype to STRING')
+            else:
+                for key, val in column_effective_value.items():
+                    if key in ('numberValue', 'stringValue', 'boolValue'):
+                        column_effective_value_type = key
+                        col_val = str(val)
+                    elif key in ('errorType', 'formulaType'):
+                        col_val = str(val)
+                        raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
+                            sheet_title, column_name, column_letter, key, col_val))
+
+            column_number_format = first_values[i].get('effectiveFormat', {}).get(
+                'numberFormat', {})
             column_number_format_type = column_number_format.get('type')
 
             # Determine datatype for sheet_json_schema
             #
-            # column_effective_value_type = numberValue, stringValue, boolValue; INVALID: errorType, formulaType
-            #   Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
+            # column_effective_value_type = numberValue, stringValue, boolValue;
+            #  INVALID: errorType, formulaType
+            #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
             #
-            # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, TIME, DATE_TIME, SCIENTIFIC
-            #   Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
+            # column_number_format_type = UNSPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
+            #   TIME, DATE_TIME, SCIENTIFIC
+            #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
             #
             column_format = None # Default
-            # column_multiple_of = None # Default
-            if column_effective_value_type in ('formulaValue', 'errorValue'):
-                raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(column_name))
+            if column_effective_value == {}:
+                col_properties = {'type': ['null', 'string']}
+                column_gs_type = 'stringValue'
+                LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
+                        sheet_title, column_name, column_letter))
+                LOGGER.info('   Setting column datatype to STRING')
             elif column_effective_value_type == 'stringValue':
-                column_type = ['null', 'string']
+                col_properties = {'type': ['null', 'string']}
                 column_gs_type = 'stringValue'
             elif column_effective_value_type == 'boolValue':
-                column_type = ['null', 'boolean', 'string']
+                col_properties = {'type': ['null', 'boolean', 'string']}
                 column_gs_type = 'boolValue'
             elif column_effective_value_type == 'numberValue':
                 if column_number_format_type == 'DATE_TIME':
-                    column_type = ['null', 'string']
-                    column_format = 'date-time'
+                    col_properties = {
+                        'type': ['null', 'string'],
+                        'format': 'date-time'
+                    }
                     column_gs_type = 'numberType.DATE_TIME'
                 elif column_number_format_type == 'DATE':
-                    column_type = ['null', 'string']
-                    column_format = 'date'
+                    col_properties = {
+                        'type': ['null', 'string'],
+                        'format': 'date'
+                    }
                     column_gs_type = 'numberType.DATE'
                 elif column_number_format_type == 'TIME':
-                    column_type = ['null', 'string']
-                    column_format = 'time'
+                    col_properties = {
+                        'type': ['null', 'string'],
+                        'format': 'time'
+                    }
                     column_gs_type = 'numberType.TIME'
                 elif column_number_format_type == 'TEXT':
-                    column_type = ['null', 'string']
+                    col_properties = {'type': ['null', 'string']}
                     column_gs_type = 'stringValue'
                 else:
-                    column_type = ['null', 'number', 'string']
+                    # Interesting - order in the anyOf makes a difference.
+                    # Number w/ multipleOf must be listed last, otherwise errors occur.
+                    col_properties =  {
+                        'anyOf': [
+                            {
+                                'type': 'null'
+                            },
+                            {
+                                'type': 'number',
+                                'multipleOf': 1e-15
+                            },
+                            {
+                                'type': 'string'
+                            }
+                        ]
+                    }
                     column_gs_type = 'numberType'
-
+            # Catch-all to deal with other types and set to string
+            # column_effective_value_type: formulaValue, errorValue, or other
+            else:
+                col_properties = {'type': ['null', 'string']}
+                column_gs_type = 'unsupportedValue'
+                LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
+                        sheet_title, column_name, column_letter, column_effective_value_type, col_val))
+                LOGGER.info('Converting to string.')
         else: # skipped
             column_is_skipped = True
             skipped = skipped + 1
             column_index_str = str(column_index).zfill(2)
             column_name = '__sdc_skip_col_{}'.format(column_index_str)
-            column_type = ['null', 'string']
-            column_format = None
+            col_properties = {'type': ['null', 'string']}
             column_gs_type = 'stringValue'
+            LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
+                sheet_title, column_name, column_letter))
+            LOGGER.info('  This column will be skipped during data loading.')
 
         if skipped >= 2:
             # skipped = 2 consecutive skipped headers
             # Remove prior_header column_name
             sheet_json_schema['properties'].pop(prior_header, None)
-            column_count = i - 1
+            LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
+                sheet_title, column_name, column_letter))
             break
 
         else:
@@ -144,10 +200,7 @@ def get_sheet_schema_columns(sheet, spreadsheet_id, client):
             }
             columns.append(column)
 
-            sheet_json_schema['properties'][column_name] = column
-            sheet_json_schema['properties'][column_name]['type'] = column_type
-            if column_format:
-                sheet_json_schema['properties'][column_name]['format'] = column_format
+            sheet_json_schema['properties'][column_name] = col_properties
 
         prior_header = column_name
         i = i + 1
@@ -155,6 +208,10 @@ def get_sheet_schema_columns(sheet, spreadsheet_id, client):
     return sheet_json_schema, columns
 
 
+# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
+#   endpoint: spreadsheets/{spreadsheet_id}
+#   params: includeGridData = true, ranges = '{sheet_title}'!1:2
+# This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc.
 def get_sheet_metadata(sheet, spreadsheet_id, client):
     sheet_id = sheet.get('properties', {}).get('sheetId')
     sheet_title = sheet.get('properties', {}).get('title')
@@ -164,14 +221,23 @@ def get_sheet_metadata(sheet, spreadsheet_id, client):
     stream_metadata = STREAMS.get(stream_name)
     api = stream_metadata.get('api', 'sheets')
     params = stream_metadata.get('params', {})
-    querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]).replace('{sheet_title}', sheet_title)
-    path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id), querystring)
+    querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \
+        params.items()]).replace('{sheet_title}', sheet_title)
+    path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
+        spreadsheet_id), querystring)
 
     sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
-    sheet_cols = sheet_md_results.get('sheets')[0]
-    sheet_schema, columns = get_sheet_schema_columns(sheet_cols, spreadsheet_id, client)
+    # sheet_metadata: 1st `sheets` node in results
+    sheet_metadata = sheet_md_results.get('sheets')[0]
+
+    # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
+    try:
+        sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
+    except:
+        LOGGER.info('SKIPPING Malformed sheet: {}'.format(sheet_title))
+        sheet_json_schema, columns = None, None
 
-    return sheet_schema, columns
+    return sheet_json_schema, columns
 
 
 def get_abs_path(path):
@@ -199,30 +265,36 @@ def get_schemas(client, spreadsheet_id):
             replication_method=stream_metadata.get('replication_method', None)
         )
         field_metadata[stream_name] = mdata
-        
+
         if stream_name == 'spreadsheet_metadata':
             api = stream_metadata.get('api', 'sheets')
             params = stream_metadata.get('params', {})
             querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
-            path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id), querystring)
+            path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
+                spreadsheet_id), querystring)
 
-            spreadsheet_md_results = client.get(path=path, params=querystring, api=api, endpoint=stream_name)
+            # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
+            spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
+                endpoint=stream_name)
 
             sheets = spreadsheet_md_results.get('sheets')
             if sheets:
+                # Loop through each worksheet in the spreadsheet
                 for sheet in sheets:
-                    sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
-                    # LOGGER.info('sheet_schema = {}'.format(json.dumps(sheet_schema, indent=2, sort_keys=True)))
-
-                    sheet_title = sheet.get('properties', {}).get('title')
-                    schemas[sheet_title] = sheet_schema
-                    sheet_mdata = metadata.new()
-                    sheet_mdata = metadata.get_standard_metadata(
-                        schema=sheet_schema,
-                        key_properties=['__sdc_row'],
-                        valid_replication_keys=None,
-                        replication_method='FULL_TABLE'
-                    )
-                    field_metadata[sheet_title] = sheet_mdata
-            
+                    # GET sheet_json_schema for each worksheet (from function above)
+                    sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
+
+                    # SKIP empty sheets (where sheet_json_schema and columns are None)
+                    if sheet_json_schema and columns:
+                        sheet_title = sheet.get('properties', {}).get('title')
+                        schemas[sheet_title] = sheet_json_schema
+                        sheet_mdata = metadata.new()
+                        sheet_mdata = metadata.get_standard_metadata(
+                            schema=sheet_json_schema,
+                            key_properties=['__sdc_row'],
+                            valid_replication_keys=None,
+                            replication_method='FULL_TABLE'
+                        )
+                        field_metadata[sheet_title] = sheet_mdata
+
     return schemas, field_metadata