author     Jeff Huth <39202799+jeffhuth-bytecode@users.noreply.github.com>   2020-02-24 09:53:26 -0800
committer  GitHub <noreply@github.com>                                       2020-02-24 12:53:26 -0500
commit     376f1145837541d4fff2ad0e499236761f9873c3 (patch)
tree       cc086f18b24bda8a86c16c3ec742b89947f382ae /tap_google_sheets
parent     f1d1d43c6b74a8705e91e908c582e39c68464c0c (diff)
v.0.0.4 Logic to skip empty sheets (#4)  [tag: v0.0.4]
* v.0.0.2 schema and sync changes
Change the number JSON schema to anyOf with multipleOf; skip empty rows; move write_bookmark to the end of sync.py.
* v.0.0.3 Sync activate version and error handling
Update README.md documentation. Improve logging and handling of errors and warnings. Improve null handling in Discovery and Sync. Fix issues with activate version messages.
* v.0.0.4 Skip empty worksheets
Add logic to skip empty worksheets in Discovery and Sync modes.
* schema.py fix number datatype issue
Number datatypes were being created as strings in targets. The JSON schema anyOf order needs to be adjusted so that the order is null, number, string (see the sketch after this list).
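To illustrate why the anyOf ordering matters downstream: a minimal sketch of a naive Singer target that maps a column's JSON schema to a SQL type by taking the first concrete type it finds in anyOf. pick_sql_type and the type names are hypothetical, for illustration only; real targets differ, but a string-first ordering produced exactly this numbers-as-strings effect.

    # Hypothetical, naive target-side type mapping (illustrative only).
    def pick_sql_type(col_properties):
        for subschema in col_properties.get('anyOf', []):
            if subschema.get('type') == 'number':
                return 'NUMERIC'
            if subschema.get('type') == 'string':
                return 'VARCHAR'
        return 'VARCHAR'

    old_order = {'anyOf': [{'type': 'string'}, {'type': 'null'},
                           {'type': 'number', 'multipleOf': 1e-15}]}
    new_order = {'anyOf': [{'type': 'null'},
                           {'type': 'number', 'multipleOf': 1e-15},
                           {'type': 'string'}]}

    print(pick_sql_type(old_order))  # VARCHAR -> numbers load as strings
    print(pick_sql_type(new_order))  # NUMERIC -> numbers stay numeric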
Diffstat (limited to 'tap_google_sheets')
-rw-r--r--  tap_google_sheets/schema.py | 365
-rw-r--r--  tap_google_sheets/sync.py   | 214
2 files changed, 294 insertions, 285 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py
index e319c03..c229d72 100644
--- a/tap_google_sheets/schema.py
+++ b/tap_google_sheets/schema.py
@@ -25,184 +25,188 @@ def get_sheet_schema_columns(sheet):
     sheet_json_schema = OrderedDict()
     data = next(iter(sheet.get('data', [])), {})
     row_data = data.get('rowData', [])
-    # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
-
-    headers = row_data[0].get('values', [])
-    first_values = row_data[1].get('values', [])
-    # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
-
-    sheet_json_schema = {
-        'type': 'object',
-        'additionalProperties': False,
-        'properties': {
-            '__sdc_spreadsheet_id': {
-                'type': ['null', 'string']
-            },
-            '__sdc_sheet_id': {
-                'type': ['null', 'integer']
-            },
-            '__sdc_row': {
-                'type': ['null', 'integer']
-            }
-        }
-    }
-
-    header_list = [] # used for checking uniqueness
-    columns = []
-    prior_header = None
-    i = 0
-    skipped = 0
-    # Read column headers until end or 2 consecutive skipped headers
-    for header in headers:
-        # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
-        column_index = i + 1
-        column_letter = colnum_string(column_index)
-        header_value = header.get('formattedValue')
-        if header_value: # NOT skipped
-            column_is_skipped = False
-            skipped = 0
-            column_name = '{}'.format(header_value)
-            if column_name in header_list:
-                raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
-                    sheet_title, column_name, column_letter))
-            header_list.append(column_name)
-
-            first_value = None
-            try:
-                first_value = first_values[i]
-            except IndexError as err:
-                raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
-                    sheet_title, column_name, column_letter, err))
-
-            column_effective_value = first_value.get('effectiveValue', {})
-
-            col_val = None
-            if column_effective_value == {}:
-                column_effective_value_type = 'stringValue'
-                LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
-                    sheet_title, column_name, column_letter))
-                LOGGER.info('  Setting column datatype to STRING')
-            else:
-                for key, val in column_effective_value.items():
-                    if key in ('numberValue', 'stringValue', 'boolValue'):
-                        column_effective_value_type = key
-                        col_val = str(val)
-                    elif key in ('errorType', 'formulaType'):
-                        col_val = str(val)
-                        raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
-                            sheet_title, column_name, column_letter, key, col_val))
-
-            column_number_format = first_values[i].get('effectiveFormat', {}).get(
-                'numberFormat', {})
-            column_number_format_type = column_number_format.get('type')
-
-            # Determine datatype for sheet_json_schema
-            #
-            # column_effective_value_type = numberValue, stringValue, boolValue;
-            #   INVALID: errorType, formulaType
-            #   https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
-            #
-            # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
-            #   TIME, DATE_TIME, SCIENTIFIC
-            #   https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
-            #
-            column_format = None # Default
-            if column_effective_value == {}:
-                col_properties = {'type': ['null', 'string']}
-                column_gs_type = 'stringValue'
-                LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
-                    sheet_title, column_name, column_letter))
-                LOGGER.info('  Setting column datatype to STRING')
-            elif column_effective_value_type == 'stringValue':
-                col_properties = {'type': ['null', 'string']}
-                column_gs_type = 'stringValue'
-            elif column_effective_value_type == 'boolValue':
-                col_properties = {'type': ['null', 'boolean', 'string']}
-                column_gs_type = 'boolValue'
-            elif column_effective_value_type == 'numberValue':
-                if column_number_format_type == 'DATE_TIME':
-                    col_properties = {
-                        'type': ['null', 'string'],
-                        'format': 'date-time'
-                    }
-                    column_gs_type = 'numberType.DATE_TIME'
-                elif column_number_format_type == 'DATE':
-                    col_properties = {
-                        'type': ['null', 'string'],
-                        'format': 'date'
-                    }
-                    column_gs_type = 'numberType.DATE'
-                elif column_number_format_type == 'TIME':
-                    col_properties = {
-                        'type': ['null', 'string'],
-                        'format': 'time'
-                    }
-                    column_gs_type = 'numberType.TIME'
-                elif column_number_format_type == 'TEXT':
-                    col_properties = {'type': ['null', 'string']}
-                    column_gs_type = 'stringValue'
-                else:
-                    # Interesting - order in the anyOf makes a difference.
-                    # Number w/ multipleOf must be listed last, otherwise errors occur.
-                    col_properties = {
-                        'anyOf': [
-                            {
-                                'type': 'string'
-                            },
-                            {
-                                'type': 'null'
-                            },
-                            {
-                                'type': 'number',
-                                'multipleOf': 1e-15
-                            }
-                        ]
-                    }
-                    column_gs_type = 'numberType'
-            # Catch-all to deal with other types and set to string
-            # column_effective_value_type: formulaValue, errorValue, or other
-            else:
-                col_properties = {'type': ['null', 'string']}
-                column_gs_type = 'unsupportedValue'
-                LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
-                    sheet_title, column_name, column_letter, column_effective_value_type, col_val))
-                LOGGER.info('Converting to string.')
-        else: # skipped
-            column_is_skipped = True
-            skipped = skipped + 1
-            column_index_str = str(column_index).zfill(2)
-            column_name = '__sdc_skip_col_{}'.format(column_index_str)
-            col_properties = {'type': ['null', 'string']}
-            column_gs_type = 'stringValue'
-            LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
-                sheet_title, column_name, column_letter))
-            LOGGER.info('  This column will be skipped during data loading.')
-
-        if skipped >= 2:
-            # skipped = 2 consecutive skipped headers
-            # Remove prior_header column_name
-            sheet_json_schema['properties'].pop(prior_header, None)
-            LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
-                sheet_title, column_name, column_letter))
-            break
-
-        else:
-            column = {}
-            column = {
-                'columnIndex': column_index,
-                'columnLetter': column_letter,
-                'columnName': column_name,
-                'columnType': column_gs_type,
-                'columnSkipped': column_is_skipped
-            }
-            columns.append(column)
-
-            sheet_json_schema['properties'][column_name] = col_properties
-
-        prior_header = column_name
-        i = i + 1
-
-    return sheet_json_schema, columns
+    if row_data == []:
+        # Empty sheet, SKIP
+        LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
+        return None, None
+    else:
+        # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
+        headers = row_data[0].get('values', [])
+        first_values = row_data[1].get('values', [])
+        # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
+
+        sheet_json_schema = {
+            'type': 'object',
+            'additionalProperties': False,
+            'properties': {
+                '__sdc_spreadsheet_id': {
+                    'type': ['null', 'string']
+                },
+                '__sdc_sheet_id': {
+                    'type': ['null', 'integer']
+                },
+                '__sdc_row': {
+                    'type': ['null', 'integer']
+                }
+            }
+        }
+
+        header_list = [] # used for checking uniqueness
+        columns = []
+        prior_header = None
+        i = 0
+        skipped = 0
+        # Read column headers until end or 2 consecutive skipped headers
+        for header in headers:
+            # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
+            column_index = i + 1
+            column_letter = colnum_string(column_index)
+            header_value = header.get('formattedValue')
+            if header_value: # NOT skipped
+                column_is_skipped = False
+                skipped = 0
+                column_name = '{}'.format(header_value)
+                if column_name in header_list:
+                    raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
+                        sheet_title, column_name, column_letter))
+                header_list.append(column_name)
+
+                first_value = None
+                try:
+                    first_value = first_values[i]
+                except IndexError as err:
+                    raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
+                        sheet_title, column_name, column_letter, err))
+
+                column_effective_value = first_value.get('effectiveValue', {})
+
+                col_val = None
+                if column_effective_value == {}:
+                    column_effective_value_type = 'stringValue'
+                    LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
+                        sheet_title, column_name, column_letter))
+                    LOGGER.info('  Setting column datatype to STRING')
+                else:
+                    for key, val in column_effective_value.items():
+                        if key in ('numberValue', 'stringValue', 'boolValue'):
+                            column_effective_value_type = key
+                            col_val = str(val)
+                        elif key in ('errorType', 'formulaType'):
+                            col_val = str(val)
+                            raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
+                                sheet_title, column_name, column_letter, key, col_val))
+
+                column_number_format = first_values[i].get('effectiveFormat', {}).get(
+                    'numberFormat', {})
+                column_number_format_type = column_number_format.get('type')
+
+                # Determine datatype for sheet_json_schema
+                #
+                # column_effective_value_type = numberValue, stringValue, boolValue;
+                #   INVALID: errorType, formulaType
+                #   https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
+                #
+                # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
+                #   TIME, DATE_TIME, SCIENTIFIC
+                #   https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
+                #
+                column_format = None # Default
+                if column_effective_value == {}:
+                    col_properties = {'type': ['null', 'string']}
+                    column_gs_type = 'stringValue'
+                    LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
+                        sheet_title, column_name, column_letter))
+                    LOGGER.info('  Setting column datatype to STRING')
+                elif column_effective_value_type == 'stringValue':
+                    col_properties = {'type': ['null', 'string']}
+                    column_gs_type = 'stringValue'
+                elif column_effective_value_type == 'boolValue':
+                    col_properties = {'type': ['null', 'boolean', 'string']}
+                    column_gs_type = 'boolValue'
+                elif column_effective_value_type == 'numberValue':
+                    if column_number_format_type == 'DATE_TIME':
+                        col_properties = {
+                            'type': ['null', 'string'],
+                            'format': 'date-time'
+                        }
+                        column_gs_type = 'numberType.DATE_TIME'
+                    elif column_number_format_type == 'DATE':
+                        col_properties = {
+                            'type': ['null', 'string'],
+                            'format': 'date'
+                        }
+                        column_gs_type = 'numberType.DATE'
+                    elif column_number_format_type == 'TIME':
+                        col_properties = {
+                            'type': ['null', 'string'],
+                            'format': 'time'
+                        }
+                        column_gs_type = 'numberType.TIME'
+                    elif column_number_format_type == 'TEXT':
+                        col_properties = {'type': ['null', 'string']}
+                        column_gs_type = 'stringValue'
+                    else:
+                        # Interesting - order in the anyOf makes a difference.
+                        # Number w/ multipleOf must be listed last, otherwise errors occur.
+                        col_properties = {
+                            'anyOf': [
+                                {
+                                    'type': 'null'
+                                },
+                                {
+                                    'type': 'number',
+                                    'multipleOf': 1e-15
+                                },
+                                {
+                                    'type': 'string'
+                                }
+                            ]
+                        }
+                        column_gs_type = 'numberType'
+                # Catch-all to deal with other types and set to string
+                # column_effective_value_type: formulaValue, errorValue, or other
+                else:
+                    col_properties = {'type': ['null', 'string']}
+                    column_gs_type = 'unsupportedValue'
+                    LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
+                        sheet_title, column_name, column_letter, column_effective_value_type, col_val))
+                    LOGGER.info('Converting to string.')
+            else: # skipped
+                column_is_skipped = True
+                skipped = skipped + 1
+                column_index_str = str(column_index).zfill(2)
+                column_name = '__sdc_skip_col_{}'.format(column_index_str)
+                col_properties = {'type': ['null', 'string']}
+                column_gs_type = 'stringValue'
+                LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
+                    sheet_title, column_name, column_letter))
+                LOGGER.info('  This column will be skipped during data loading.')
+
+            if skipped >= 2:
+                # skipped = 2 consecutive skipped headers
+                # Remove prior_header column_name
+                sheet_json_schema['properties'].pop(prior_header, None)
+                LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
+                    sheet_title, column_name, column_letter))
+                break
+
+            else:
+                column = {}
+                column = {
+                    'columnIndex': column_index,
+                    'columnLetter': column_letter,
+                    'columnName': column_name,
+                    'columnType': column_gs_type,
+                    'columnSkipped': column_is_skipped
+                }
+                columns.append(column)
+
+                sheet_json_schema['properties'][column_name] = col_properties
+
+            prior_header = column_name
+            i = i + 1
+
+        return sheet_json_schema, columns
 
 
 # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
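The new guard at the top of get_sheet_schema_columns keys off the rowData list in the Sheets API grid data. A minimal sketch of the shape involved, using an assumed, heavily abbreviated spreadsheets.get response fragment (real responses carry many more fields):

    # An empty worksheet tab has no 'rowData' in its grid data, so the
    # guard returns (None, None) instead of raising IndexError on row_data[0].
    empty_sheet = {
        'properties': {'sheetId': 123456, 'title': 'Blank Tab'},  # hypothetical values
        'data': [{}]  # no 'rowData' key for an empty tab
    }

    data = next(iter(empty_sheet.get('data', [])), {})
    row_data = data.get('rowData', [])  # -> [], which triggers the skip

Callers must now handle the (None, None) sentinel, which is exactly what the get_schemas hunk below and the sync.py change do.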
@@ -276,17 +280,18 @@ def get_schemas(client, spreadsheet_id):
     for sheet in sheets:
         # GET sheet_json_schema for each worksheet (from function above)
         sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
-        # LOGGER.info('columns = {}'.format(columns))
 
-        sheet_title = sheet.get('properties', {}).get('title')
-        schemas[sheet_title] = sheet_json_schema
-        sheet_mdata = metadata.new()
-        sheet_mdata = metadata.get_standard_metadata(
-            schema=sheet_json_schema,
-            key_properties=['__sdc_row'],
-            valid_replication_keys=None,
-            replication_method='FULL_TABLE'
-        )
-        field_metadata[sheet_title] = sheet_mdata
+        # SKIP empty sheets (where sheet_json_schema and columns are None)
+        if sheet_json_schema and columns:
+            sheet_title = sheet.get('properties', {}).get('title')
+            schemas[sheet_title] = sheet_json_schema
+            sheet_mdata = metadata.new()
+            sheet_mdata = metadata.get_standard_metadata(
+                schema=sheet_json_schema,
+                key_properties=['__sdc_row'],
+                valid_replication_keys=None,
+                replication_method='FULL_TABLE'
+            )
+            field_metadata[sheet_title] = sheet_mdata
 
     return schemas, field_metadata
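The discovery code above calls colnum_string to turn a 1-based column index into an A1-notation column letter. The helper is defined elsewhere in schema.py; a functionally equivalent sketch (bijective base-26), for reference:

    def colnum_string(num):
        # 1 -> 'A', 26 -> 'Z', 27 -> 'AA', 703 -> 'AAA'
        string = ''
        while num > 0:
            num, remainder = divmod(num - 1, 26)
            string = chr(65 + remainder) + string
        return string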
diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py
index 311281c..b77eab3 100644
--- a/tap_google_sheets/sync.py
+++ b/tap_google_sheets/sync.py
@@ -429,113 +429,117 @@ def sync(client, config, catalog, state):
         sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
         # LOGGER.info('sheet_schema: {}'.format(sheet_schema))
 
-        # Transform sheet_metadata
-        sheet_metadata_tf = transform_sheet_metadata(spreadsheet_id, sheet, columns)
-        # LOGGER.info('sheet_metadata_tf = {}'.format(sheet_metadata_tf))
-        sheet_metadata.append(sheet_metadata_tf)
-
-        # SHEET_DATA
-        # Should this worksheet tab be synced?
-        if sheet_title in selected_streams:
-            LOGGER.info('STARTED Syncing Sheet {}'.format(sheet_title))
-            update_currently_syncing(state, sheet_title)
-            selected_fields = get_selected_fields(catalog, sheet_title)
-            LOGGER.info('Stream: {}, selected_fields: {}'.format(sheet_title, selected_fields))
-            write_schema(catalog, sheet_title)
-
-            # Emit a Singer ACTIVATE_VERSION message before initial sync (but not subsequent syncs)
-            # everytime after each sheet sync is complete.
-            # This forces hard deletes on the data downstream if fewer records are sent.
-            # https://github.com/singer-io/singer-python/blob/master/singer/messages.py#L137
-            last_integer = int(get_bookmark(state, sheet_title, 0))
-            activate_version = int(time.time() * 1000)
-            activate_version_message = singer.ActivateVersionMessage(
-                stream=sheet_title,
-                version=activate_version)
-            if last_integer == 0:
-                # initial load, send activate_version before AND after data sync
-                singer.write_message(activate_version_message)
-                LOGGER.info('INITIAL SYNC, Stream: {}, Activate Version: {}'.format(sheet_title, activate_version))
-
-            # Determine max range of columns and rows for "paging" through the data
-            sheet_last_col_index = 1
-            sheet_last_col_letter = 'A'
-            for col in columns:
-                col_index = col.get('columnIndex')
-                col_letter = col.get('columnLetter')
-                if col_index > sheet_last_col_index:
-                    sheet_last_col_index = col_index
-                    sheet_last_col_letter = col_letter
-            sheet_max_row = sheet.get('properties').get('gridProperties', {}).get('rowCount')
-
-            # Initialize paging for 1st batch
-            is_last_row = False
-            batch_rows = 200
-            from_row = 2
-            if sheet_max_row < batch_rows:
-                to_row = sheet_max_row
-            else:
-                to_row = batch_rows
-
-            # Loop thru batches (each having 200 rows of data)
-            while not is_last_row and from_row < sheet_max_row and to_row <= sheet_max_row:
-                range_rows = 'A{}:{}{}'.format(from_row, sheet_last_col_letter, to_row)
-
-                # GET sheet_data for a worksheet tab
-                sheet_data, time_extracted = get_data(
-                    stream_name=sheet_title,
-                    endpoint_config=sheets_loaded_config,
-                    client=client,
-                    spreadsheet_id=spreadsheet_id,
-                    range_rows=range_rows)
-                # Data is returned as a list of arrays, an array of values for each row
-                sheet_data_rows = sheet_data.get('values')
-
-                # Transform batch of rows to JSON with keys for each column
-                sheet_data_tf, row_num = transform_sheet_data(
-                    spreadsheet_id=spreadsheet_id,
-                    sheet_id=sheet_id,
-                    sheet_title=sheet_title,
-                    from_row=from_row,
-                    columns=columns,
-                    sheet_data_rows=sheet_data_rows)
-                if row_num < to_row:
-                    is_last_row = True
-
-                # Process records, send batch of records to target
-                record_count = process_records(
-                    catalog=catalog,
-                    stream_name=sheet_title,
-                    records=sheet_data_tf,
-                    time_extracted=ss_time_extracted,
-                    version=activate_version)
-                LOGGER.info('Sheet: {}, records processed: {}'.format(
-                    sheet_title, record_count))
-
-                # Update paging from/to_row for next batch
-                from_row = to_row + 1
-                if to_row + batch_rows > sheet_max_row:
-                    to_row = sheet_max_row
-                else:
-                    to_row = to_row + batch_rows
-
-            # End of Stream: Send Activate Version and update State
-            singer.write_message(activate_version_message)
-            write_bookmark(state, sheet_title, activate_version)
-            LOGGER.info('COMPLETE SYNC, Stream: {}, Activate Version: {}'.format(sheet_title, activate_version))
-            LOGGER.info('FINISHED Syncing Sheet {}, Total Rows: {}'.format(
-                sheet_title, row_num - 2)) # subtract 1 for header row
-            update_currently_syncing(state, None)
-
-            # SHEETS_LOADED
-            # Add sheet to sheets_loaded
-            sheet_loaded = {}
-            sheet_loaded['spreadsheetId'] = spreadsheet_id
-            sheet_loaded['sheetId'] = sheet_id
-            sheet_loaded['title'] = sheet_title
-            sheet_loaded['loadDate'] = strftime(utils.now())
-            sheet_loaded['lastRowNumber'] = row_num
-            sheets_loaded.append(sheet_loaded)
+        # SKIP empty sheets (where sheet_schema and columns are None)
+        if not sheet_schema or not columns:
+            LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
+        else:
+            # Transform sheet_metadata
+            sheet_metadata_tf = transform_sheet_metadata(spreadsheet_id, sheet, columns)
+            # LOGGER.info('sheet_metadata_tf = {}'.format(sheet_metadata_tf))
+            sheet_metadata.append(sheet_metadata_tf)
+
+            # SHEET_DATA
+            # Should this worksheet tab be synced?
+            if sheet_title in selected_streams:
+                LOGGER.info('STARTED Syncing Sheet {}'.format(sheet_title))
+                update_currently_syncing(state, sheet_title)
+                selected_fields = get_selected_fields(catalog, sheet_title)
+                LOGGER.info('Stream: {}, selected_fields: {}'.format(sheet_title, selected_fields))
+                write_schema(catalog, sheet_title)
+
+                # Emit a Singer ACTIVATE_VERSION message before initial sync (but not subsequent syncs)
+                # everytime after each sheet sync is complete.
+                # This forces hard deletes on the data downstream if fewer records are sent.
+                # https://github.com/singer-io/singer-python/blob/master/singer/messages.py#L137
+                last_integer = int(get_bookmark(state, sheet_title, 0))
+                activate_version = int(time.time() * 1000)
+                activate_version_message = singer.ActivateVersionMessage(
+                    stream=sheet_title,
+                    version=activate_version)
+                if last_integer == 0:
+                    # initial load, send activate_version before AND after data sync
+                    singer.write_message(activate_version_message)
+                    LOGGER.info('INITIAL SYNC, Stream: {}, Activate Version: {}'.format(sheet_title, activate_version))
+
+                # Determine max range of columns and rows for "paging" through the data
+                sheet_last_col_index = 1
+                sheet_last_col_letter = 'A'
+                for col in columns:
+                    col_index = col.get('columnIndex')
+                    col_letter = col.get('columnLetter')
+                    if col_index > sheet_last_col_index:
+                        sheet_last_col_index = col_index
+                        sheet_last_col_letter = col_letter
+                sheet_max_row = sheet.get('properties').get('gridProperties', {}).get('rowCount')
+
+                # Initialize paging for 1st batch
+                is_last_row = False
+                batch_rows = 200
+                from_row = 2
+                if sheet_max_row < batch_rows:
+                    to_row = sheet_max_row
+                else:
+                    to_row = batch_rows
+
+                # Loop thru batches (each having 200 rows of data)
+                while not is_last_row and from_row < sheet_max_row and to_row <= sheet_max_row:
+                    range_rows = 'A{}:{}{}'.format(from_row, sheet_last_col_letter, to_row)
+
+                    # GET sheet_data for a worksheet tab
+                    sheet_data, time_extracted = get_data(
+                        stream_name=sheet_title,
+                        endpoint_config=sheets_loaded_config,
+                        client=client,
+                        spreadsheet_id=spreadsheet_id,
+                        range_rows=range_rows)
+                    # Data is returned as a list of arrays, an array of values for each row
+                    sheet_data_rows = sheet_data.get('values')
+
+                    # Transform batch of rows to JSON with keys for each column
+                    sheet_data_tf, row_num = transform_sheet_data(
+                        spreadsheet_id=spreadsheet_id,
+                        sheet_id=sheet_id,
+                        sheet_title=sheet_title,
+                        from_row=from_row,
+                        columns=columns,
+                        sheet_data_rows=sheet_data_rows)
+                    if row_num < to_row:
+                        is_last_row = True
+
+                    # Process records, send batch of records to target
+                    record_count = process_records(
+                        catalog=catalog,
+                        stream_name=sheet_title,
+                        records=sheet_data_tf,
+                        time_extracted=ss_time_extracted,
+                        version=activate_version)
+                    LOGGER.info('Sheet: {}, records processed: {}'.format(
+                        sheet_title, record_count))
+
+                    # Update paging from/to_row for next batch
+                    from_row = to_row + 1
+                    if to_row + batch_rows > sheet_max_row:
+                        to_row = sheet_max_row
+                    else:
+                        to_row = to_row + batch_rows
+
+                # End of Stream: Send Activate Version and update State
+                singer.write_message(activate_version_message)
+                write_bookmark(state, sheet_title, activate_version)
+                LOGGER.info('COMPLETE SYNC, Stream: {}, Activate Version: {}'.format(sheet_title, activate_version))
+                LOGGER.info('FINISHED Syncing Sheet {}, Total Rows: {}'.format(
+                    sheet_title, row_num - 2)) # subtract 1 for header row
+                update_currently_syncing(state, None)
+
+                # SHEETS_LOADED
+                # Add sheet to sheets_loaded
+                sheet_loaded = {}
+                sheet_loaded['spreadsheetId'] = spreadsheet_id
+                sheet_loaded['sheetId'] = sheet_id
+                sheet_loaded['title'] = sheet_title
+                sheet_loaded['loadDate'] = strftime(utils.now())
+                sheet_loaded['lastRowNumber'] = row_num
+                sheets_loaded.append(sheet_loaded)
 
     stream_name = 'sheet_metadata'
     # Sync sheet_metadata if selected
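The sync loop above pages through each selected sheet in fixed 200-row batches, starting at row 2 because row 1 holds the headers, and brackets each stream with ACTIVATE_VERSION messages so targets can hard-delete rows that disappear between syncs. A minimal standalone sketch of just the range arithmetic; batch_ranges is illustrative and not part of the tap, and the real loop also interleaves get_data, transform_sheet_data, and process_records and can stop early via is_last_row:

    def batch_ranges(sheet_max_row, sheet_last_col_letter, batch_rows=200):
        # Yield A1-notation ranges covering rows 2..sheet_max_row in batches.
        from_row = 2
        to_row = min(batch_rows, sheet_max_row)
        while from_row < sheet_max_row:
            yield 'A{}:{}{}'.format(from_row, sheet_last_col_letter, to_row)
            from_row = to_row + 1
            to_row = min(to_row + batch_rows, sheet_max_row)

    # A 450-row sheet whose last column is F:
    print(list(batch_ranges(450, 'F')))
    # ['A2:F200', 'A201:F400', 'A401:F450']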