author     Jeff Huth <39202799+jeffhuth-bytecode@users.noreply.github.com> 2019-12-04 06:10:46 -0800
committer  Kyle Allan <KAllan357@gmail.com> 2019-12-04 09:10:46 -0500
commit     5890b89c1aa7c554235b3cef156b5a5a2c594bec (patch)
tree       2f553cda853991aedb1cec68dc6d06b87d6fe190
parent     075af7096d3c1b369702feba4076c25b954732dc (diff)
v0.0.2 schema and sync changes (#1) [tag: v0.0.2]
Change the number JSON schema to anyOf with multipleOf; skip empty rows; move write_bookmark to the end of sync.py.
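The headline change: a plain number column no longer maps to a bare ['null', 'number', 'string'] type; it now emits an anyOf schema whose number branch carries a multipleOf constraint. A minimal before/after sketch, taken from the tap_google_sheets/schema.py diff below:

    # before: number columns were typed directly
    old_number_schema = {'type': ['null', 'number', 'string']}

    # after: anyOf, with the number branch (and its multipleOf) listed last
    new_number_schema = {
        'anyOf': [
            {'type': 'string'},
            {'type': 'null'},
            {'type': 'number', 'multipleOf': 1e-15}
        ]
    }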
-rw-r--r--  CHANGELOG.md                   3
-rw-r--r--  setup.py                       2
-rw-r--r--  tap_google_sheets/schema.py   83
-rw-r--r--  tap_google_sheets/streams.py  10
-rw-r--r--  tap_google_sheets/sync.py    166
5 files changed, 164 insertions(+), 100 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d58f396..e5d6560 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,7 @@
 # Changelog
 
+## 0.0.2
+  * Change number json schema to anyOf with multipleOf; skip empty rows; move write_bookmark to end of sync.py
+
 ## 0.0.1
   * Initial commit
diff --git a/setup.py b/setup.py
index e3c4f3e..6fe2493 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 from setuptools import setup, find_packages
 
 setup(name='tap-google-sheets',
-      version='0.0.1',
+      version='0.0.2',
       description='Singer.io tap for extracting data from the Google Sheets v4 API',
       author='jeff.huth@bytecode.io',
       classifiers=['Programming Language :: Python :: 3 :: Only'],
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py
index d4fead5..243467b 100644
--- a/tap_google_sheets/schema.py
+++ b/tap_google_sheets/schema.py
@@ -30,8 +30,6 @@ def get_sheet_schema_columns(sheet):
     first_values = row_data[1].get('values', [])
     # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
 
-    sheet_json_schema['type'] = 'object'
-    sheet_json_schema['additionalProperties'] = False
     sheet_json_schema = {
         'type': 'object',
         'additionalProperties': False,
@@ -89,42 +87,66 @@ def get_sheet_schema_columns(sheet):
             #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
             #
             column_format = None # Default
-            # column_multiple_of = None # Default
             if column_effective_value_type == 'stringValue':
-                column_type = ['null', 'string']
+                col_properties = {'type': ['null', 'string']}
                 column_gs_type = 'stringValue'
             elif column_effective_value_type == 'boolValue':
-                column_type = ['null', 'boolean', 'string']
+                col_properties = {'type': ['null', 'boolean', 'string']}
                 column_gs_type = 'boolValue'
             elif column_effective_value_type == 'numberValue':
                 if column_number_format_type == 'DATE_TIME':
-                    column_type = ['null', 'string']
-                    column_format = 'date-time'
+                    col_properties = {
+                        'type': ['null', 'string'],
+                        'format': 'date-time'
+                    }
                     column_gs_type = 'numberType.DATE_TIME'
                 elif column_number_format_type == 'DATE':
-                    column_type = ['null', 'string']
-                    column_format = 'date'
+                    col_properties = {
+                        'type': ['null', 'string'],
+                        'format': 'date'
+                    }
                     column_gs_type = 'numberType.DATE'
                 elif column_number_format_type == 'TIME':
-                    column_type = ['null', 'string']
-                    column_format = 'time'
+                    col_properties = {
+                        'type': ['null', 'string'],
+                        'format': 'time'
+                    }
                     column_gs_type = 'numberType.TIME'
                 elif column_number_format_type == 'TEXT':
-                    column_type = ['null', 'string']
+                    col_properties = {'type': ['null', 'string']}
                     column_gs_type = 'stringValue'
                 else:
-                    column_type = ['null', 'number', 'string']
+                    # Interesting - order in the anyOf makes a difference.
+                    # Number w/ multipleOf must be listed last, otherwise errors occur.
+                    col_properties = {
+                        'anyOf': [
+                            {
+                                'type': 'string'
+                            },
+                            {
+                                'type': 'null'
+                            },
+                            {
+                                'type': 'number',
+                                'multipleOf': 1e-15
+                            }
+                        ]
+                    }
                     column_gs_type = 'numberType'
-            elif column_effective_value_type in ('formulaValue', 'errorValue'):
-                raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(column_name, \
+            # Catch-all to deal with other types and set to string
+            # column_effective_value_type: formulaValue, errorValue, or other
+            else:
+                col_properties = {'type': ['null', 'string']}
+                column_gs_type = 'unsupportedValue'
+                LOGGER.info('Unsupported data type: {}, value: {}'.format(column_name, \
                     column_effective_value_type))
+                LOGGER.info('Converting to string.')
         else: # skipped
             column_is_skipped = True
             skipped = skipped + 1
             column_index_str = str(column_index).zfill(2)
             column_name = '__sdc_skip_col_{}'.format(column_index_str)
-            column_type = ['null', 'string']
-            column_format = None
+            col_properties = {'type': ['null', 'string']}
             column_gs_type = 'stringValue'
 
         if skipped >= 2:
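For quick reference, the branch above reduces to this mapping (a sketch assembled from the diff, not code from the tap; keys are (column_effective_value_type, column_number_format_type) pairs):

    # Sketch only: summary of the schema branch above.
    SCHEMA_MAPPING = {
        ('stringValue', None):        ({'type': ['null', 'string']}, 'stringValue'),
        ('boolValue', None):          ({'type': ['null', 'boolean', 'string']}, 'boolValue'),
        ('numberValue', 'DATE_TIME'): ({'type': ['null', 'string'], 'format': 'date-time'}, 'numberType.DATE_TIME'),
        ('numberValue', 'DATE'):      ({'type': ['null', 'string'], 'format': 'date'}, 'numberType.DATE'),
        ('numberValue', 'TIME'):      ({'type': ['null', 'string'], 'format': 'time'}, 'numberType.TIME'),
        ('numberValue', 'TEXT'):      ({'type': ['null', 'string']}, 'stringValue'),
        # any other numberValue format -> the anyOf/multipleOf schema, 'numberType'
        # formulaValue, errorValue, or other -> ['null', 'string'], 'unsupportedValue'
    }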
@@ -144,10 +166,7 @@ def get_sheet_schema_columns(sheet):
         }
         columns.append(column)
 
-        sheet_json_schema['properties'][column_name] = column
-        sheet_json_schema['properties'][column_name]['type'] = column_type
-        if column_format:
-            sheet_json_schema['properties'][column_name]['format'] = column_format
+        sheet_json_schema['properties'][column_name] = col_properties
 
         prior_header = column_name
         i = i + 1
@@ -155,6 +174,10 @@ def get_sheet_schema_columns(sheet):
     return sheet_json_schema, columns
 
 
+# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
+#   endpoint: spreadsheets/{spreadsheet_id}
+#   params: includeGridData = true, ranges = '{sheet_title}'!1:2
+# This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc.
 def get_sheet_metadata(sheet, spreadsheet_id, client):
     sheet_id = sheet.get('properties', {}).get('sheetId')
     sheet_title = sheet.get('properties', {}).get('title')
@@ -170,10 +193,13 @@ def get_sheet_metadata(sheet, spreadsheet_id, client):
         spreadsheet_id), querystring)
 
     sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
-    sheet_cols = sheet_md_results.get('sheets')[0]
-    sheet_schema, columns = get_sheet_schema_columns(sheet_cols)
+    # sheet_metadata: 1st `sheets` node in results
+    sheet_metadata = sheet_md_results.get('sheets')[0]
 
-    return sheet_schema, columns
+    # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
+    sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
+
+    return sheet_json_schema, columns
 
 
 def get_abs_path(path):
@@ -209,20 +235,23 @@ def get_schemas(client, spreadsheet_id):
     path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
         spreadsheet_id), querystring)
 
+    # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
    spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
         endpoint=stream_name)
 
     sheets = spreadsheet_md_results.get('sheets')
     if sheets:
+        # Loop thru each worksheet in spreadsheet
         for sheet in sheets:
-            sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
+            # GET sheet_json_schema for each worksheet (from function above)
+            sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
             LOGGER.info('columns = {}'.format(columns))
 
             sheet_title = sheet.get('properties', {}).get('title')
-            schemas[sheet_title] = sheet_schema
+            schemas[sheet_title] = sheet_json_schema
             sheet_mdata = metadata.new()
             sheet_mdata = metadata.get_standard_metadata(
-                schema=sheet_schema,
+                schema=sheet_json_schema,
                 key_properties=['__sdc_row'],
                 valid_replication_keys=None,
                 replication_method='FULL_TABLE'
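Putting the schema.py changes together: for a hypothetical worksheet whose header row is ['Name', 'Amount'], with a text cell and an unformatted number cell in row 2, discovery would now emit a schema shaped roughly like this (worksheet and column names are invented; the __sdc_* property types are assumed from the tap's conventions, since the first hunk truncates that block):

    sheet_json_schema = {
        'type': 'object',
        'additionalProperties': False,
        'properties': {
            '__sdc_spreadsheet_id': {'type': ['null', 'string']},
            '__sdc_sheet_id': {'type': ['null', 'integer']},
            '__sdc_row': {'type': ['null', 'integer']},
            'Name': {'type': ['null', 'string']},
            'Amount': {
                'anyOf': [
                    {'type': 'string'},
                    {'type': 'null'},
                    {'type': 'number', 'multipleOf': 1e-15}
                ]
            }
        }
    }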
diff --git a/tap_google_sheets/streams.py b/tap_google_sheets/streams.py
index b8e3eff..ad5529f 100644
--- a/tap_google_sheets/streams.py
+++ b/tap_google_sheets/streams.py
@@ -13,6 +13,8 @@ from collections import OrderedDict
13# data_key: JSON element containing the results list for the endpoint; 13# data_key: JSON element containing the results list for the endpoint;
14# default = root (no data_key) 14# default = root (no data_key)
15 15
16# file_metadata: Queries Google Drive API to get file information and see if file has been modified
17# Provides audit info about who and when last changed the file.
16FILE_METADATA = { 18FILE_METADATA = {
17 "api": "files", 19 "api": "files",
18 "path": "files/{spreadsheet_id}", 20 "path": "files/{spreadsheet_id}",
@@ -24,6 +26,7 @@ FILE_METADATA = {
24 } 26 }
25} 27}
26 28
29# spreadsheet_metadata: Queries spreadsheet to get basic information on spreadhsheet and sheets
27SPREADSHEET_METADATA = { 30SPREADSHEET_METADATA = {
28 "api": "sheets", 31 "api": "sheets",
29 "path": "spreadsheets/{spreadsheet_id}", 32 "path": "spreadsheets/{spreadsheet_id}",
@@ -34,6 +37,9 @@ SPREADSHEET_METADATA = {
     }
 }
 
+# sheet_metadata: Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet.
+#   This endpoint includes detailed metadata about each cell in the header and first data row
+#   incl. data type, formatting, etc.
 SHEET_METADATA = {
     "api": "sheets",
     "path": "spreadsheets/{spreadsheet_id}",
@@ -45,6 +51,8 @@ SHEET_METADATA = {
     }
 }
 
+# sheets_loaded: Queries a batch of Rows for each Sheet in the Spreadsheet.
+#   Each query uses the `values` endpoint to get data only, w/out the formatting/type metadata.
 SHEETS_LOADED = {
     "api": "sheets",
     "path": "spreadsheets/{spreadsheet_id}/values/'{sheet_title}'!{range_rows}",
@@ -58,7 +66,7 @@ SHEETS_LOADED = {
     }
 }
 
-# Ensure streams are ordered logically
+# Ensure streams are ordered sequentially, logically.
 STREAMS = OrderedDict()
 STREAMS['file_metadata'] = FILE_METADATA
 STREAMS['spreadsheet_metadata'] = SPREADSHEET_METADATA
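As a usage sketch, this is roughly how get_data() in sync.py (below) expands one of these endpoint configs into a request path. The spreadsheet id and sheet title are invented, and the params shown are assumed from the schema.py comments for the sheet_metadata query:

    endpoint_config = SHEET_METADATA
    path = endpoint_config['path'].replace('{spreadsheet_id}', 'abc123')
    params = {'includeGridData': 'true', 'ranges': "'{sheet_title}'!1:2"}  # assumed params
    querystring = '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()]).replace(
        '{sheet_title}', 'Sheet1')
    print('{}?{}'.format(path, querystring))
    # -> spreadsheets/abc123?includeGridData=true&ranges='Sheet1'!1:2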
diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py
index d7d7184..76b2e59 100644
--- a/tap_google_sheets/sync.py
+++ b/tap_google_sheets/sync.py
@@ -125,11 +125,14 @@ def get_data(stream_name,
              range_rows=None):
     if not range_rows:
         range_rows = ''
+    # Replace {placeholder} variables in path
     path = endpoint_config.get('path', stream_name).replace(
         '{spreadsheet_id}', spreadsheet_id).replace('{sheet_title}', stream_name).replace(
             '{range_rows}', range_rows)
     params = endpoint_config.get('params', {})
     api = endpoint_config.get('api', 'sheets')
+    # Add in querystring parameters and replace {placeholder} variables;
+    #   the querystring is built by hand so parameters are not URL-encoded (encoding causes API errors)
     querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]).replace(
         '{sheet_title}', stream_name)
     data = {}
@@ -192,7 +195,7 @@ def excel_to_dttm_str(excel_date_sn, timezone_str=None):
         timezone_str = 'UTC'
     tzn = pytz.timezone(timezone_str)
     sec_per_day = 86400
-    excel_epoch = 25569 # 1970-01-01T00:00:00Z
+    excel_epoch = 25569 # 1970-01-01T00:00:00Z, Lotus Notes Serial Number for Epoch Start Date
     epoch_sec = math.floor((excel_date_sn - excel_epoch) * sec_per_day)
     epoch_dttm = datetime(1970, 1, 1)
     excel_dttm = epoch_dttm + timedelta(seconds=epoch_sec)
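A quick worked example of the conversion above: Sheets serial numbers count days since 1899-12-30 (the Lotus epoch), so serial 25569 lands exactly on 1970-01-01 and fractional days carry the time of day. The sample value is invented:

    from datetime import datetime, timedelta
    import math

    excel_date_sn = 43799.5   # assumed sample serial number
    excel_epoch = 25569       # 1970-01-01T00:00:00Z
    epoch_sec = math.floor((excel_date_sn - excel_epoch) * 86400)
    print(datetime(1970, 1, 1) + timedelta(seconds=epoch_sec))
    # -> 2019-11-30 12:00:00 (UTC, before timezone localization)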
@@ -205,85 +208,103 @@
 # Convert from array of values to JSON with column names as keys
 def transform_sheet_data(spreadsheet_id, sheet_id, from_row, columns, sheet_data_rows):
     sheet_data_tf = []
-    is_last_row = False
     row_num = from_row
     # Create sorted list of columns based on columnIndex
     cols = sorted(columns, key=lambda i: i['columnIndex'])
 
     # LOGGER.info('sheet_data_rows: {}'.format(sheet_data_rows))
     for row in sheet_data_rows:
-        # If empty row, return sheet_data_tf w/ is_last_row and row_num - 1
+        # If empty row, SKIP
         if row == []:
-            is_last_row = True
-            return sheet_data_tf, row_num - 1, is_last_row
-        sheet_data_row_tf = {}
-        # Add spreadsheet_id, sheet_id, and row
-        sheet_data_row_tf['__sdc_spreadsheet_id'] = spreadsheet_id
-        sheet_data_row_tf['__sdc_sheet_id'] = sheet_id
-        sheet_data_row_tf['__sdc_row'] = row_num
-        col_num = 1
-        for value in row:
-            # Select column metadata based on column index
-            col = cols[col_num - 1]
-            col_skipped = col.get('columnSkipped')
-            if not col_skipped:
-                col_name = col.get('columnName')
-                col_type = col.get('columnType')
-                # Convert dates/times from Lotus Notes Serial Numbers
-                if col_type == 'numberType.DATE_TIME':
-                    if isinstance(value, (int, float)):
-                        col_val = excel_to_dttm_str(value)
-                    else:
-                        col_val = str(value)
-                elif col_type == 'numberType.DATE':
-                    if isinstance(value, (int, float)):
-                        col_val = excel_to_dttm_str(value)[:10]
-                    else:
-                        col_val = str(value)
-                elif col_type == 'numberType.TIME':
-                    if isinstance(value, (int, float)):
-                        try:
-                            total_secs = value * 86400 # seconds in day
-                            col_val = str(timedelta(seconds=total_secs))
-                        except ValueError:
-                            col_val = str(value)
-                    else:
-                        col_val = str(value)
-                elif col_type == 'numberType':
-                    if isinstance(value, int):
-                        col_val = int(value)
-                    else:
-                        try:
-                            col_val = float(value)
-                        except ValueError:
-                            col_val = str(value)
-                elif col_type == 'stringValue':
-                    col_val = str(value)
-                elif col_type == 'boolValue':
-                    if isinstance(value, bool):
-                        col_val = value
-                    elif isinstance(value, str):
-                        if value.lower() in ('true', 't', 'yes', 'y'):
-                            col_val = True
-                        elif value.lower() in ('false', 'f', 'no', 'n'):
-                            col_val = False
-                        else:
-                            col_val = str(value)
-                    elif isinstance(value, int):
-                        if value in (1, -1):
-                            col_val = True
-                        elif value == 0:
-                            col_val = False
-                    else:
-                        col_val = str(value)
-
-                else:
-                    col_val = value
-                sheet_data_row_tf[col_name] = col_val
-            col_num = col_num + 1
-        sheet_data_tf.append(sheet_data_row_tf)
+            LOGGER.info('EMPTY ROW: {}, SKIPPING'.format(row_num))
+        else:
+            sheet_data_row_tf = {}
+            # Add spreadsheet_id, sheet_id, and row
+            sheet_data_row_tf['__sdc_spreadsheet_id'] = spreadsheet_id
+            sheet_data_row_tf['__sdc_sheet_id'] = sheet_id
+            sheet_data_row_tf['__sdc_row'] = row_num
+            col_num = 1
+            for value in row:
+                # Select column metadata based on column index
+                col = cols[col_num - 1]
+                col_skipped = col.get('columnSkipped')
+                if not col_skipped:
+                    col_name = col.get('columnName')
+                    col_type = col.get('columnType')
+                    # Convert dates/times from Lotus Notes Serial Numbers
+                    # DATE-TIME
+                    if col_type == 'numberType.DATE_TIME':
+                        if isinstance(value, (int, float)):
+                            col_val = excel_to_dttm_str(value)
+                        else:
+                            col_val = str(value)
+                    # DATE
+                    elif col_type == 'numberType.DATE':
+                        if isinstance(value, (int, float)):
+                            col_val = excel_to_dttm_str(value)[:10]
+                        else:
+                            col_val = str(value)
+                    # TIME ONLY (NO DATE)
+                    elif col_type == 'numberType.TIME':
+                        if isinstance(value, (int, float)):
+                            try:
+                                total_secs = value * 86400 # seconds in day
+                                # Create string formatted like HH:MM:SS
+                                col_val = str(timedelta(seconds=total_secs))
+                            except ValueError:
+                                col_val = str(value)
+                        else:
+                            col_val = str(value)
+                    # NUMBER (INTEGER AND FLOAT)
+                    elif col_type == 'numberType':
+                        if isinstance(value, int):
+                            col_val = int(value)
+                        elif isinstance(value, float):
+                            # Determine float decimal digits
+                            decimal_digits = str(value)[::-1].find('.')
+                            if decimal_digits > 15:
+                                try:
+                                    # ROUND to multipleOf: 1e-15
+                                    col_val = float(round(value, 15))
+                                except ValueError:
+                                    col_val = str(value)
+                            else: # decimal_digits <= 15, no rounding
+                                try:
+                                    col_val = float(value)
+                                except ValueError:
+                                    col_val = str(value)
+                        else:
+                            col_val = str(value)
+                    # STRING
+                    elif col_type == 'stringValue':
+                        col_val = str(value)
+                    # BOOLEAN
+                    elif col_type == 'boolValue':
+                        if isinstance(value, bool):
+                            col_val = value
+                        elif isinstance(value, str):
+                            if value.lower() in ('true', 't', 'yes', 'y'):
+                                col_val = True
+                            elif value.lower() in ('false', 'f', 'no', 'n'):
+                                col_val = False
+                            else:
+                                col_val = str(value)
+                        elif isinstance(value, int):
+                            if value in (1, -1):
+                                col_val = True
+                            elif value == 0:
+                                col_val = False
+                            else:
+                                col_val = str(value)
+                    # OTHER: Convert everything else to a string
+                    else:
+                        col_val = str(value)
+                    sheet_data_row_tf[col_name] = col_val
+                col_num = col_num + 1
+            # APPEND non-empty row
+            sheet_data_tf.append(sheet_data_row_tf)
         row_num = row_num + 1
-    return sheet_data_tf, row_num, is_last_row
+    return sheet_data_tf, row_num
 
 
 def sync(client, config, catalog, state):
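The new float branch exists because of the multipleOf: 1e-15 schema above: a Python float can print with more than 15 decimal digits, which some targets then reject as not a multiple of 1e-15. The reversed-string trick counts the digits after the decimal point; a small demonstration with an invented value:

    value = 0.12345678901234567            # assumed sample, 17 decimal digits
    decimal_digits = str(value)[::-1].find('.')
    print(decimal_digits)                  # 17 -> round before emitting
    print(float(round(value, 15)))         # 0.123456789012346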
@@ -327,7 +348,7 @@ def sync(client, config, catalog, state):
         return
     # Sync file_metadata if selected
     sync_stream(stream_name, selected_streams, catalog, state, file_metadata_tf, time_extracted)
-    write_bookmark(state, stream_name, strftime(this_datetime))
+    # file_metadata bookmark is updated at the end of sync
 
     # SPREADSHEET_METADATA
     spreadsheet_metadata = {}
@@ -363,7 +384,7 @@ def sync(client, config, catalog, state):
 
             # GET sheet_metadata and columns
             sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
-            LOGGER.info('sheet_schema: {}'.format(sheet_schema))
+            # LOGGER.info('sheet_schema: {}'.format(sheet_schema))
 
             # Transform sheet_metadata
             sheet_metadata_tf = transform_sheet_metadata(spreadsheet_id, sheet, columns)
@@ -414,7 +435,7 @@ def sync(client, config, catalog, state):
                 sheet_data_rows = sheet_data.get('values')
 
                 # Transform batch of rows to JSON with keys for each column
-                sheet_data_tf, row_num, is_last_row = transform_sheet_data(
+                sheet_data_tf, row_num = transform_sheet_data(
                     spreadsheet_id=spreadsheet_id,
                     sheet_id=sheet_id,
                     from_row=from_row,
@@ -429,7 +450,7 @@ def sync(client, config, catalog, state):
                     stream_name=sheet_title,
                     records=sheet_data_tf,
                     time_extracted=ss_time_extracted)
-                LOGGER.info('Sheet: {}, ecords processed: {}'.format(
+                LOGGER.info('Sheet: {}, records processed: {}'.format(
                     sheet_title, record_count))
 
                 # Update paging from/to_row for next batch
@@ -458,7 +479,7 @@ def sync(client, config, catalog, state):
             singer.write_message(activate_version_message)
 
             LOGGER.info('FINISHED Syncing Sheet {}, Total Rows: {}'.format(
-                sheet_title, row_num - 1))
+                sheet_title, row_num - 2)) # subtract 1 for header row
 
         stream_name = 'sheet_metadata'
         # Sync sheet_metadata if selected
@@ -468,4 +489,7 @@ def sync(client, config, catalog, state):
     # Sync sheet_metadata if selected
     sync_stream(stream_name, selected_streams, catalog, state, sheets_loaded)
 
+    # Update file_metadata bookmark
+    write_bookmark(state, 'file_metadata', strftime(this_datetime))
+
     return
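On the write_bookmark move: the file_metadata bookmark now advances only after every sheet has synced, so an interrupted run re-syncs the spreadsheet instead of skipping it on the next run. A toy illustration of the failure mode this avoids (names here are invented, not the tap's code):

    def run_sync(state, sheets, crash_after=None):
        for i, sheet in enumerate(sheets):
            if crash_after is not None and i >= crash_after:
                raise RuntimeError('crashed mid-sync')
        # Bookmark written last, mirroring the new ordering in sync.py
        state.setdefault('bookmarks', {})['file_metadata'] = '2019-12-04T00:00:00Z'

    state = {}
    try:
        run_sync(state, ['Sheet1', 'Sheet2'], crash_after=1)
    except RuntimeError:
        pass
    print(state)  # {} -> bookmark untouched; next run re-syncs everything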