From 376f1145837541d4fff2ad0e499236761f9873c3 Mon Sep 17 00:00:00 2001
From: Jeff Huth <39202799+jeffhuth-bytecode@users.noreply.github.com>
Date: Mon, 24 Feb 2020 09:53:26 -0800
Subject: v.0.0.4 Logic to skip empty sheets (#4)

* v.0.0.2 schema and sync changes

Change number json schema to anyOf with multipleOf; skip empty rows; move write_bookmark to end of sync.py

* v.0.0.3 Sync activate version and error handling

Update README.md documentation. Improved logging and handling of errors and warnings. Better null handling in Discovery and Sync. Fix issues with activate version messages.

* v.0.0.4 Skip empty worksheets

Add logic to skip empty worksheets in Discovery and Sync mode.

* schema.py fix number datatype issue

Nomber datatypes are being created as strings in targets. The JSON schema order needs to be adjusted so that order is null, number, string.
---
 tap_google_sheets/schema.py | 365 ++++++++++++++++++++++----------------------
 1 file changed, 185 insertions(+), 180 deletions(-)

(limited to 'tap_google_sheets/schema.py')

diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py
index e319c03..c229d72 100644
--- a/tap_google_sheets/schema.py
+++ b/tap_google_sheets/schema.py
@@ -25,184 +25,188 @@ def get_sheet_schema_columns(sheet):
     sheet_json_schema = OrderedDict()
     data = next(iter(sheet.get('data', [])), {})
     row_data = data.get('rowData', [])
-    # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
-
-    headers = row_data[0].get('values', [])
-    first_values = row_data[1].get('values', [])
-    # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
-
-    sheet_json_schema = {
-        'type': 'object',
-        'additionalProperties': False,
-        'properties': {
-            '__sdc_spreadsheet_id': {
-                'type': ['null', 'string']
-            },
-            '__sdc_sheet_id': {
-                'type': ['null', 'integer']
-            },
-            '__sdc_row': {
-                'type': ['null', 'integer']
+    if row_data == []:
+        # Empty sheet, SKIP
+        LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
+        return None, None
+    else:
+        # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
+        headers = row_data[0].get('values', [])
+        first_values = row_data[1].get('values', [])
+        # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
+
+        sheet_json_schema = {
+            'type': 'object',
+            'additionalProperties': False,
+            'properties': {
+                '__sdc_spreadsheet_id': {
+                    'type': ['null', 'string']
+                },
+                '__sdc_sheet_id': {
+                    'type': ['null', 'integer']
+                },
+                '__sdc_row': {
+                    'type': ['null', 'integer']
+                }
             }
         }
-    }
-
-    header_list = [] # used for checking uniqueness
-    columns = []
-    prior_header = None
-    i = 0
-    skipped = 0
-    # Read column headers until end or 2 consecutive skipped headers
-    for header in headers:
-        # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
-        column_index = i + 1
-        column_letter = colnum_string(column_index)
-        header_value = header.get('formattedValue')
-        if header_value: # NOT skipped
-            column_is_skipped = False
-            skipped = 0
-            column_name = '{}'.format(header_value)
-            if column_name in header_list:
-                raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
-                    sheet_title, column_name, column_letter))
-            header_list.append(column_name)
-
-            first_value = None
-            try:
-                first_value = first_values[i]
-            except IndexError as err:
-                raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
-                    sheet_title, column_name, column_letter, err))
-            
-            column_effective_value = first_value.get('effectiveValue', {})
-
-            col_val = None
-            if column_effective_value == {}:
-                column_effective_value_type = 'stringValue'
-                LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
-                    sheet_title, column_name, column_letter))
-                LOGGER.info('   Setting column datatype to STRING')
-            else:
-                for key, val in column_effective_value.items():
-                    if key in ('numberValue', 'stringValue', 'boolValue'):
-                        column_effective_value_type = key
-                        col_val = str(val)
-                    elif key in ('errorType', 'formulaType'):
-                        col_val = str(val)
-                        raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
-                            sheet_title, column_name, column_letter, key, col_val))
-
-            column_number_format = first_values[i].get('effectiveFormat', {}).get(
-                'numberFormat', {})
-            column_number_format_type = column_number_format.get('type')
-
-            # Determine datatype for sheet_json_schema
-            #
-            # column_effective_value_type = numberValue, stringValue, boolValue;
-            #  INVALID: errorType, formulaType
-            #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
-            #
-            # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
-            #   TIME, DATE_TIME, SCIENTIFIC
-            #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
-            #
-            column_format = None # Default
-            if column_effective_value == {}:
-                col_properties = {'type': ['null', 'string']}
-                column_gs_type = 'stringValue'
-                LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
+
+        header_list = [] # used for checking uniqueness
+        columns = []
+        prior_header = None
+        i = 0
+        skipped = 0
+        # Read column headers until end or 2 consecutive skipped headers
+        for header in headers:
+            # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
+            column_index = i + 1
+            column_letter = colnum_string(column_index)
+            header_value = header.get('formattedValue')
+            if header_value: # NOT skipped
+                column_is_skipped = False
+                skipped = 0
+                column_name = '{}'.format(header_value)
+                if column_name in header_list:
+                    raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
                         sheet_title, column_name, column_letter))
-                LOGGER.info('   Setting column datatype to STRING')
-            elif column_effective_value_type == 'stringValue':
-                col_properties = {'type': ['null', 'string']}
-                column_gs_type = 'stringValue'
-            elif column_effective_value_type == 'boolValue':
-                col_properties = {'type': ['null', 'boolean', 'string']}
-                column_gs_type = 'boolValue'
-            elif column_effective_value_type == 'numberValue':
-                if column_number_format_type == 'DATE_TIME':
-                    col_properties = {
-                        'type': ['null', 'string'],
-                        'format': 'date-time'
-                    }
-                    column_gs_type = 'numberType.DATE_TIME'
-                elif column_number_format_type == 'DATE':
-                    col_properties = {
-                        'type': ['null', 'string'],
-                        'format': 'date'
-                    }
-                    column_gs_type = 'numberType.DATE'
-                elif column_number_format_type == 'TIME':
-                    col_properties = {
-                        'type': ['null', 'string'],
-                        'format': 'time'
-                    }
-                    column_gs_type = 'numberType.TIME'
-                elif column_number_format_type == 'TEXT':
+                header_list.append(column_name)
+
+                first_value = None
+                try:
+                    first_value = first_values[i]
+                except IndexError as err:
+                    raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
+                        sheet_title, column_name, column_letter, err))
+                
+                column_effective_value = first_value.get('effectiveValue', {})
+
+                col_val = None
+                if column_effective_value == {}:
+                    column_effective_value_type = 'stringValue'
+                    LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
+                        sheet_title, column_name, column_letter))
+                    LOGGER.info('   Setting column datatype to STRING')
+                else:
+                    for key, val in column_effective_value.items():
+                        if key in ('numberValue', 'stringValue', 'boolValue'):
+                            column_effective_value_type = key
+                            col_val = str(val)
+                        elif key in ('errorType', 'formulaType'):
+                            col_val = str(val)
+                            raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
+                                sheet_title, column_name, column_letter, key, col_val))
+
+                column_number_format = first_values[i].get('effectiveFormat', {}).get(
+                    'numberFormat', {})
+                column_number_format_type = column_number_format.get('type')
+
+                # Determine datatype for sheet_json_schema
+                #
+                # column_effective_value_type = numberValue, stringValue, boolValue;
+                #  INVALID: errorType, formulaType
+                #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
+                #
+                # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
+                #   TIME, DATE_TIME, SCIENTIFIC
+                #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
+                #
+                column_format = None # Default
+                if column_effective_value == {}:
+                    col_properties = {'type': ['null', 'string']}
+                    column_gs_type = 'stringValue'
+                    LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
+                            sheet_title, column_name, column_letter))
+                    LOGGER.info('   Setting column datatype to STRING')
+                elif column_effective_value_type == 'stringValue':
                     col_properties = {'type': ['null', 'string']}
                     column_gs_type = 'stringValue'
+                elif column_effective_value_type == 'boolValue':
+                    col_properties = {'type': ['null', 'boolean', 'string']}
+                    column_gs_type = 'boolValue'
+                elif column_effective_value_type == 'numberValue':
+                    if column_number_format_type == 'DATE_TIME':
+                        col_properties = {
+                            'type': ['null', 'string'],
+                            'format': 'date-time'
+                        }
+                        column_gs_type = 'numberType.DATE_TIME'
+                    elif column_number_format_type == 'DATE':
+                        col_properties = {
+                            'type': ['null', 'string'],
+                            'format': 'date'
+                        }
+                        column_gs_type = 'numberType.DATE'
+                    elif column_number_format_type == 'TIME':
+                        col_properties = {
+                            'type': ['null', 'string'],
+                            'format': 'time'
+                        }
+                        column_gs_type = 'numberType.TIME'
+                    elif column_number_format_type == 'TEXT':
+                        col_properties = {'type': ['null', 'string']}
+                        column_gs_type = 'stringValue'
+                    else:
+                        # Interesting - order in the anyOf makes a difference.
+                        # Number w/ multipleOf must be listed last, otherwise errors occur.
+                        col_properties =  {
+                            'anyOf': [
+                                {
+                                    'type': 'null'
+                                },
+                                {
+                                    'type': 'number',
+                                    'multipleOf': 1e-15
+                                },
+                                {
+                                    'type': 'string'
+                                }
+                            ]
+                        }
+                        column_gs_type = 'numberType'
+                # Catch-all to deal with other types and set to string
+                # column_effective_value_type: formulaValue, errorValue, or other
                 else:
-                    # Interesting - order in the anyOf makes a difference.
-                    # Number w/ multipleOf must be listed last, otherwise errors occur.
-                    col_properties =  {
-                        'anyOf': [
-                            {
-                                'type': 'string'
-                            },
-                            {
-                                'type': 'null'
-                            },
-                            {
-                                'type': 'number',
-                                'multipleOf': 1e-15
-                            }
-                        ]
-                    }
-                    column_gs_type = 'numberType'
-            # Catch-all to deal with other types and set to string
-            # column_effective_value_type: formulaValue, errorValue, or other
-            else:
+                    col_properties = {'type': ['null', 'string']}
+                    column_gs_type = 'unsupportedValue'
+                    LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
+                            sheet_title, column_name, column_letter, column_effective_value_type, col_val))
+                    LOGGER.info('Converting to string.')
+            else: # skipped
+                column_is_skipped = True
+                skipped = skipped + 1
+                column_index_str = str(column_index).zfill(2)
+                column_name = '__sdc_skip_col_{}'.format(column_index_str)
                 col_properties = {'type': ['null', 'string']}
-                column_gs_type = 'unsupportedValue'
-                LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
-                        sheet_title, column_name, column_letter, column_effective_value_type, col_val))
-                LOGGER.info('Converting to string.')
-        else: # skipped
-            column_is_skipped = True
-            skipped = skipped + 1
-            column_index_str = str(column_index).zfill(2)
-            column_name = '__sdc_skip_col_{}'.format(column_index_str)
-            col_properties = {'type': ['null', 'string']}
-            column_gs_type = 'stringValue'
-            LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
-                sheet_title, column_name, column_letter))
-            LOGGER.info('  This column will be skipped during data loading.')
-
-        if skipped >= 2:
-            # skipped = 2 consecutive skipped headers
-            # Remove prior_header column_name
-            sheet_json_schema['properties'].pop(prior_header, None)
-            LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
-                sheet_title, column_name, column_letter))
-            break
-
-        else:
-            column = {}
-            column = {
-                'columnIndex': column_index,
-                'columnLetter': column_letter,
-                'columnName': column_name,
-                'columnType': column_gs_type,
-                'columnSkipped': column_is_skipped
-            }
-            columns.append(column)
+                column_gs_type = 'stringValue'
+                LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
+                    sheet_title, column_name, column_letter))
+                LOGGER.info('  This column will be skipped during data loading.')
 
-            sheet_json_schema['properties'][column_name] = col_properties
+            if skipped >= 2:
+                # skipped = 2 consecutive skipped headers
+                # Remove prior_header column_name
+                sheet_json_schema['properties'].pop(prior_header, None)
+                LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
+                    sheet_title, column_name, column_letter))
+                break
 
-        prior_header = column_name
-        i = i + 1
+            else:
+                column = {}
+                column = {
+                    'columnIndex': column_index,
+                    'columnLetter': column_letter,
+                    'columnName': column_name,
+                    'columnType': column_gs_type,
+                    'columnSkipped': column_is_skipped
+                }
+                columns.append(column)
 
-    return sheet_json_schema, columns
+                sheet_json_schema['properties'][column_name] = col_properties
+
+            prior_header = column_name
+            i = i + 1
+
+        return sheet_json_schema, columns
 
 
 # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
@@ -276,17 +280,18 @@ def get_schemas(client, spreadsheet_id):
                 for sheet in sheets:
                     # GET sheet_json_schema for each worksheet (from function above)
                     sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
-                    # LOGGER.info('columns = {}'.format(columns))
-
-                    sheet_title = sheet.get('properties', {}).get('title')
-                    schemas[sheet_title] = sheet_json_schema
-                    sheet_mdata = metadata.new()
-                    sheet_mdata = metadata.get_standard_metadata(
-                        schema=sheet_json_schema,
-                        key_properties=['__sdc_row'],
-                        valid_replication_keys=None,
-                        replication_method='FULL_TABLE'
-                    )
-                    field_metadata[sheet_title] = sheet_mdata
+
+                    # SKIP empty sheets (where sheet_json_schema and columns are None)
+                    if sheet_json_schema and columns:
+                        sheet_title = sheet.get('properties', {}).get('title')
+                        schemas[sheet_title] = sheet_json_schema
+                        sheet_mdata = metadata.new()
+                        sheet_mdata = metadata.get_standard_metadata(
+                            schema=sheet_json_schema,
+                            key_properties=['__sdc_row'],
+                            valid_replication_keys=None,
+                            replication_method='FULL_TABLE'
+                        )
+                        field_metadata[sheet_title] = sheet_mdata
 
     return schemas, field_metadata
-- 
cgit v1.2.3