diff options
author | Jeff Huth <39202799+jeffhuth-bytecode@users.noreply.github.com> | 2019-12-04 06:10:46 -0800 |
---|---|---|
committer | Kyle Allan <KAllan357@gmail.com> | 2019-12-04 09:10:46 -0500 |
commit | 5890b89c1aa7c554235b3cef156b5a5a2c594bec (patch) | |
tree | 2f553cda853991aedb1cec68dc6d06b87d6fe190 /tap_google_sheets | |
parent | 075af7096d3c1b369702feba4076c25b954732dc (diff) | |
download | tap-google-sheets-8b2afddb15d10a6727c50c9c6feaeff6060b114e.tar.gz tap-google-sheets-8b2afddb15d10a6727c50c9c6feaeff6060b114e.tar.zst tap-google-sheets-8b2afddb15d10a6727c50c9c6feaeff6060b114e.zip |
v.0.0.2 schema and sync changes (#1)v0.0.2
Change number json schema to anyOf with multipleOf; skip empty rows; move write_bookmark to end of sync.py
Diffstat (limited to 'tap_google_sheets')
-rw-r--r-- | tap_google_sheets/schema.py | 83 | ||||
-rw-r--r-- | tap_google_sheets/streams.py | 10 | ||||
-rw-r--r-- | tap_google_sheets/sync.py | 166 |
3 files changed, 160 insertions, 99 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index d4fead5..243467b 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py | |||
@@ -30,8 +30,6 @@ def get_sheet_schema_columns(sheet): | |||
30 | first_values = row_data[1].get('values', []) | 30 | first_values = row_data[1].get('values', []) |
31 | # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) | 31 | # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) |
32 | 32 | ||
33 | sheet_json_schema['type'] = 'object' | ||
34 | sheet_json_schema['additionalProperties'] = False | ||
35 | sheet_json_schema = { | 33 | sheet_json_schema = { |
36 | 'type': 'object', | 34 | 'type': 'object', |
37 | 'additionalProperties': False, | 35 | 'additionalProperties': False, |
@@ -89,42 +87,66 @@ def get_sheet_schema_columns(sheet): | |||
89 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType | 87 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType |
90 | # | 88 | # |
91 | column_format = None # Default | 89 | column_format = None # Default |
92 | # column_multiple_of = None # Default | ||
93 | if column_effective_value_type == 'stringValue': | 90 | if column_effective_value_type == 'stringValue': |
94 | column_type = ['null', 'string'] | 91 | col_properties = {'type': ['null', 'string']} |
95 | column_gs_type = 'stringValue' | 92 | column_gs_type = 'stringValue' |
96 | elif column_effective_value_type == 'boolValue': | 93 | elif column_effective_value_type == 'boolValue': |
97 | column_type = ['null', 'boolean', 'string'] | 94 | col_properties = {'type': ['null', 'boolean', 'string']} |
98 | column_gs_type = 'boolValue' | 95 | column_gs_type = 'boolValue' |
99 | elif column_effective_value_type == 'numberValue': | 96 | elif column_effective_value_type == 'numberValue': |
100 | if column_number_format_type == 'DATE_TIME': | 97 | if column_number_format_type == 'DATE_TIME': |
101 | column_type = ['null', 'string'] | 98 | col_properties = { |
102 | column_format = 'date-time' | 99 | 'type': ['null', 'string'], |
100 | 'format': 'date-time' | ||
101 | } | ||
103 | column_gs_type = 'numberType.DATE_TIME' | 102 | column_gs_type = 'numberType.DATE_TIME' |
104 | elif column_number_format_type == 'DATE': | 103 | elif column_number_format_type == 'DATE': |
105 | column_type = ['null', 'string'] | 104 | col_properties = { |
106 | column_format = 'date' | 105 | 'type': ['null', 'string'], |
106 | 'format': 'date' | ||
107 | } | ||
107 | column_gs_type = 'numberType.DATE' | 108 | column_gs_type = 'numberType.DATE' |
108 | elif column_number_format_type == 'TIME': | 109 | elif column_number_format_type == 'TIME': |
109 | column_type = ['null', 'string'] | 110 | col_properties = { |
110 | column_format = 'time' | 111 | 'type': ['null', 'string'], |
112 | 'format': 'time' | ||
113 | } | ||
111 | column_gs_type = 'numberType.TIME' | 114 | column_gs_type = 'numberType.TIME' |
112 | elif column_number_format_type == 'TEXT': | 115 | elif column_number_format_type == 'TEXT': |
113 | column_type = ['null', 'string'] | 116 | col_properties = {'type': ['null', 'string']} |
114 | column_gs_type = 'stringValue' | 117 | column_gs_type = 'stringValue' |
115 | else: | 118 | else: |
116 | column_type = ['null', 'number', 'string'] | 119 | # Interesting - order in the anyOf makes a difference. |
120 | # Number w/ multipleOf must be listed last, otherwise errors occur. | ||
121 | col_properties = { | ||
122 | 'anyOf': [ | ||
123 | { | ||
124 | 'type': 'string' | ||
125 | }, | ||
126 | { | ||
127 | 'type': 'null' | ||
128 | }, | ||
129 | { | ||
130 | 'type': 'number', | ||
131 | 'multipleOf': 1e-15 | ||
132 | } | ||
133 | ] | ||
134 | } | ||
117 | column_gs_type = 'numberType' | 135 | column_gs_type = 'numberType' |
118 | elif column_effective_value_type in ('formulaValue', 'errorValue'): | 136 | # Catch-all to deal with other types and set to string |
119 | raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(column_name, \ | 137 | # column_effective_value_type: formulaValue, errorValue, or other |
138 | else: | ||
139 | col_properties = {'type': ['null', 'string']} | ||
140 | column_gs_type = 'unsupportedValue' | ||
141 | LOGGER.info('Unsupported data type: {}, value: {}'.format(column_name, \ | ||
120 | column_effective_value_type)) | 142 | column_effective_value_type)) |
143 | LOGGER.info('Converting to string.') | ||
121 | else: # skipped | 144 | else: # skipped |
122 | column_is_skipped = True | 145 | column_is_skipped = True |
123 | skipped = skipped + 1 | 146 | skipped = skipped + 1 |
124 | column_index_str = str(column_index).zfill(2) | 147 | column_index_str = str(column_index).zfill(2) |
125 | column_name = '__sdc_skip_col_{}'.format(column_index_str) | 148 | column_name = '__sdc_skip_col_{}'.format(column_index_str) |
126 | column_type = ['null', 'string'] | 149 | col_properties = {'type': ['null', 'string']} |
127 | column_format = None | ||
128 | column_gs_type = 'stringValue' | 150 | column_gs_type = 'stringValue' |
129 | 151 | ||
130 | if skipped >= 2: | 152 | if skipped >= 2: |
@@ -144,10 +166,7 @@ def get_sheet_schema_columns(sheet): | |||
144 | } | 166 | } |
145 | columns.append(column) | 167 | columns.append(column) |
146 | 168 | ||
147 | sheet_json_schema['properties'][column_name] = column | 169 | sheet_json_schema['properties'][column_name] = col_properties |
148 | sheet_json_schema['properties'][column_name]['type'] = column_type | ||
149 | if column_format: | ||
150 | sheet_json_schema['properties'][column_name]['format'] = column_format | ||
151 | 170 | ||
152 | prior_header = column_name | 171 | prior_header = column_name |
153 | i = i + 1 | 172 | i = i + 1 |
@@ -155,6 +174,10 @@ def get_sheet_schema_columns(sheet): | |||
155 | return sheet_json_schema, columns | 174 | return sheet_json_schema, columns |
156 | 175 | ||
157 | 176 | ||
177 | # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query | ||
178 | # endpoint: spreadsheets/{spreadsheet_id} | ||
179 | # params: includeGridData = true, ranges = '{sheet_title}'!1:2 | ||
180 | # This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc. | ||
158 | def get_sheet_metadata(sheet, spreadsheet_id, client): | 181 | def get_sheet_metadata(sheet, spreadsheet_id, client): |
159 | sheet_id = sheet.get('properties', {}).get('sheetId') | 182 | sheet_id = sheet.get('properties', {}).get('sheetId') |
160 | sheet_title = sheet.get('properties', {}).get('title') | 183 | sheet_title = sheet.get('properties', {}).get('title') |
@@ -170,10 +193,13 @@ def get_sheet_metadata(sheet, spreadsheet_id, client): | |||
170 | spreadsheet_id), querystring) | 193 | spreadsheet_id), querystring) |
171 | 194 | ||
172 | sheet_md_results = client.get(path=path, api=api, endpoint=stream_name) | 195 | sheet_md_results = client.get(path=path, api=api, endpoint=stream_name) |
173 | sheet_cols = sheet_md_results.get('sheets')[0] | 196 | # sheet_metadata: 1st `sheets` node in results |
174 | sheet_schema, columns = get_sheet_schema_columns(sheet_cols) | 197 | sheet_metadata = sheet_md_results.get('sheets')[0] |
175 | 198 | ||
176 | return sheet_schema, columns | 199 | # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results) |
200 | sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata) | ||
201 | |||
202 | return sheet_json_schema, columns | ||
177 | 203 | ||
178 | 204 | ||
179 | def get_abs_path(path): | 205 | def get_abs_path(path): |
@@ -209,20 +235,23 @@ def get_schemas(client, spreadsheet_id): | |||
209 | path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ | 235 | path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ |
210 | spreadsheet_id), querystring) | 236 | spreadsheet_id), querystring) |
211 | 237 | ||
238 | # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet) | ||
212 | spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \ | 239 | spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \ |
213 | endpoint=stream_name) | 240 | endpoint=stream_name) |
214 | 241 | ||
215 | sheets = spreadsheet_md_results.get('sheets') | 242 | sheets = spreadsheet_md_results.get('sheets') |
216 | if sheets: | 243 | if sheets: |
244 | # Loop thru each worksheet in spreadsheet | ||
217 | for sheet in sheets: | 245 | for sheet in sheets: |
218 | sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) | 246 | # GET sheet_json_schema for each worksheet (from function above) |
247 | sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) | ||
219 | LOGGER.info('columns = {}'.format(columns)) | 248 | LOGGER.info('columns = {}'.format(columns)) |
220 | 249 | ||
221 | sheet_title = sheet.get('properties', {}).get('title') | 250 | sheet_title = sheet.get('properties', {}).get('title') |
222 | schemas[sheet_title] = sheet_schema | 251 | schemas[sheet_title] = sheet_json_schema |
223 | sheet_mdata = metadata.new() | 252 | sheet_mdata = metadata.new() |
224 | sheet_mdata = metadata.get_standard_metadata( | 253 | sheet_mdata = metadata.get_standard_metadata( |
225 | schema=sheet_schema, | 254 | schema=sheet_json_schema, |
226 | key_properties=['__sdc_row'], | 255 | key_properties=['__sdc_row'], |
227 | valid_replication_keys=None, | 256 | valid_replication_keys=None, |
228 | replication_method='FULL_TABLE' | 257 | replication_method='FULL_TABLE' |
diff --git a/tap_google_sheets/streams.py b/tap_google_sheets/streams.py index b8e3eff..ad5529f 100644 --- a/tap_google_sheets/streams.py +++ b/tap_google_sheets/streams.py | |||
@@ -13,6 +13,8 @@ from collections import OrderedDict | |||
13 | # data_key: JSON element containing the results list for the endpoint; | 13 | # data_key: JSON element containing the results list for the endpoint; |
14 | # default = root (no data_key) | 14 | # default = root (no data_key) |
15 | 15 | ||
16 | # file_metadata: Queries Google Drive API to get file information and see if file has been modified | ||
17 | # Provides audit info about who and when last changed the file. | ||
16 | FILE_METADATA = { | 18 | FILE_METADATA = { |
17 | "api": "files", | 19 | "api": "files", |
18 | "path": "files/{spreadsheet_id}", | 20 | "path": "files/{spreadsheet_id}", |
@@ -24,6 +26,7 @@ FILE_METADATA = { | |||
24 | } | 26 | } |
25 | } | 27 | } |
26 | 28 | ||
29 | # spreadsheet_metadata: Queries spreadsheet to get basic information on spreadhsheet and sheets | ||
27 | SPREADSHEET_METADATA = { | 30 | SPREADSHEET_METADATA = { |
28 | "api": "sheets", | 31 | "api": "sheets", |
29 | "path": "spreadsheets/{spreadsheet_id}", | 32 | "path": "spreadsheets/{spreadsheet_id}", |
@@ -34,6 +37,9 @@ SPREADSHEET_METADATA = { | |||
34 | } | 37 | } |
35 | } | 38 | } |
36 | 39 | ||
40 | # sheet_metadata: Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet. | ||
41 | # This endpoint includes detailed metadata about each cell in the header and first data row | ||
42 | # incl. data type, formatting, etc. | ||
37 | SHEET_METADATA = { | 43 | SHEET_METADATA = { |
38 | "api": "sheets", | 44 | "api": "sheets", |
39 | "path": "spreadsheets/{spreadsheet_id}", | 45 | "path": "spreadsheets/{spreadsheet_id}", |
@@ -45,6 +51,8 @@ SHEET_METADATA = { | |||
45 | } | 51 | } |
46 | } | 52 | } |
47 | 53 | ||
54 | # sheets_loaded: Queries a batch of Rows for each Sheet in the Spreadsheet. | ||
55 | # Each query uses the `values` endpoint, to get data-only, w/out the formatting/type metadata. | ||
48 | SHEETS_LOADED = { | 56 | SHEETS_LOADED = { |
49 | "api": "sheets", | 57 | "api": "sheets", |
50 | "path": "spreadsheets/{spreadsheet_id}/values/'{sheet_title}'!{range_rows}", | 58 | "path": "spreadsheets/{spreadsheet_id}/values/'{sheet_title}'!{range_rows}", |
@@ -58,7 +66,7 @@ SHEETS_LOADED = { | |||
58 | } | 66 | } |
59 | } | 67 | } |
60 | 68 | ||
61 | # Ensure streams are ordered logically | 69 | # Ensure streams are ordered sequentially, logically. |
62 | STREAMS = OrderedDict() | 70 | STREAMS = OrderedDict() |
63 | STREAMS['file_metadata'] = FILE_METADATA | 71 | STREAMS['file_metadata'] = FILE_METADATA |
64 | STREAMS['spreadsheet_metadata'] = SPREADSHEET_METADATA | 72 | STREAMS['spreadsheet_metadata'] = SPREADSHEET_METADATA |
diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py index d7d7184..76b2e59 100644 --- a/tap_google_sheets/sync.py +++ b/tap_google_sheets/sync.py | |||
@@ -125,11 +125,14 @@ def get_data(stream_name, | |||
125 | range_rows=None): | 125 | range_rows=None): |
126 | if not range_rows: | 126 | if not range_rows: |
127 | range_rows = '' | 127 | range_rows = '' |
128 | # Replace {placeholder} variables in path | ||
128 | path = endpoint_config.get('path', stream_name).replace( | 129 | path = endpoint_config.get('path', stream_name).replace( |
129 | '{spreadsheet_id}', spreadsheet_id).replace('{sheet_title}', stream_name).replace( | 130 | '{spreadsheet_id}', spreadsheet_id).replace('{sheet_title}', stream_name).replace( |
130 | '{range_rows}', range_rows) | 131 | '{range_rows}', range_rows) |
131 | params = endpoint_config.get('params', {}) | 132 | params = endpoint_config.get('params', {}) |
132 | api = endpoint_config.get('api', 'sheets') | 133 | api = endpoint_config.get('api', 'sheets') |
134 | # Add in querystring parameters and replace {placeholder} variables | ||
135 | # querystring function ensures parameters are added but not encoded causing API errors | ||
133 | querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]).replace( | 136 | querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]).replace( |
134 | '{sheet_title}', stream_name) | 137 | '{sheet_title}', stream_name) |
135 | data = {} | 138 | data = {} |
@@ -192,7 +195,7 @@ def excel_to_dttm_str(excel_date_sn, timezone_str=None): | |||
192 | timezone_str = 'UTC' | 195 | timezone_str = 'UTC' |
193 | tzn = pytz.timezone(timezone_str) | 196 | tzn = pytz.timezone(timezone_str) |
194 | sec_per_day = 86400 | 197 | sec_per_day = 86400 |
195 | excel_epoch = 25569 # 1970-01-01T00:00:00Z | 198 | excel_epoch = 25569 # 1970-01-01T00:00:00Z, Lotus Notes Serial Number for Epoch Start Date |
196 | epoch_sec = math.floor((excel_date_sn - excel_epoch) * sec_per_day) | 199 | epoch_sec = math.floor((excel_date_sn - excel_epoch) * sec_per_day) |
197 | epoch_dttm = datetime(1970, 1, 1) | 200 | epoch_dttm = datetime(1970, 1, 1) |
198 | excel_dttm = epoch_dttm + timedelta(seconds=epoch_sec) | 201 | excel_dttm = epoch_dttm + timedelta(seconds=epoch_sec) |
@@ -205,85 +208,103 @@ def excel_to_dttm_str(excel_date_sn, timezone_str=None): | |||
205 | # Convert from array of values to JSON with column names as keys | 208 | # Convert from array of values to JSON with column names as keys |
206 | def transform_sheet_data(spreadsheet_id, sheet_id, from_row, columns, sheet_data_rows): | 209 | def transform_sheet_data(spreadsheet_id, sheet_id, from_row, columns, sheet_data_rows): |
207 | sheet_data_tf = [] | 210 | sheet_data_tf = [] |
208 | is_last_row = False | ||
209 | row_num = from_row | 211 | row_num = from_row |
210 | # Create sorted list of columns based on columnIndex | 212 | # Create sorted list of columns based on columnIndex |
211 | cols = sorted(columns, key=lambda i: i['columnIndex']) | 213 | cols = sorted(columns, key=lambda i: i['columnIndex']) |
212 | 214 | ||
213 | # LOGGER.info('sheet_data_rows: {}'.format(sheet_data_rows)) | 215 | # LOGGER.info('sheet_data_rows: {}'.format(sheet_data_rows)) |
214 | for row in sheet_data_rows: | 216 | for row in sheet_data_rows: |
215 | # If empty row, return sheet_data_tf w/ is_last_row and row_num - 1 | 217 | # If empty row, SKIP |
216 | if row == []: | 218 | if row == []: |
217 | is_last_row = True | 219 | LOGGER.info('EMPTY ROW: {}, SKIPPING'.format(row_num)) |
218 | return sheet_data_tf, row_num - 1, is_last_row | 220 | else: |
219 | sheet_data_row_tf = {} | 221 | sheet_data_row_tf = {} |
220 | # Add spreadsheet_id, sheet_id, and row | 222 | # Add spreadsheet_id, sheet_id, and row |
221 | sheet_data_row_tf['__sdc_spreadsheet_id'] = spreadsheet_id | 223 | sheet_data_row_tf['__sdc_spreadsheet_id'] = spreadsheet_id |
222 | sheet_data_row_tf['__sdc_sheet_id'] = sheet_id | 224 | sheet_data_row_tf['__sdc_sheet_id'] = sheet_id |
223 | sheet_data_row_tf['__sdc_row'] = row_num | 225 | sheet_data_row_tf['__sdc_row'] = row_num |
224 | col_num = 1 | 226 | col_num = 1 |
225 | for value in row: | 227 | for value in row: |
226 | # Select column metadata based on column index | 228 | # Select column metadata based on column index |
227 | col = cols[col_num - 1] | 229 | col = cols[col_num - 1] |
228 | col_skipped = col.get('columnSkipped') | 230 | col_skipped = col.get('columnSkipped') |
229 | if not col_skipped: | 231 | if not col_skipped: |
230 | col_name = col.get('columnName') | 232 | col_name = col.get('columnName') |
231 | col_type = col.get('columnType') | 233 | col_type = col.get('columnType') |
232 | # Convert dates/times from Lotus Notes Serial Numbers | 234 | # Convert dates/times from Lotus Notes Serial Numbers |
233 | if col_type == 'numberType.DATE_TIME': | 235 | # DATE-TIME |
234 | if isinstance(value, (int, float)): | 236 | if col_type == 'numberType.DATE_TIME': |
235 | col_val = excel_to_dttm_str(value) | 237 | if isinstance(value, (int, float)): |
236 | else: | 238 | col_val = excel_to_dttm_str(value) |
237 | col_val = str(value) | 239 | else: |
238 | elif col_type == 'numberType.DATE': | ||
239 | if isinstance(value, (int, float)): | ||
240 | col_val = excel_to_dttm_str(value)[:10] | ||
241 | else: | ||
242 | col_val = str(value) | ||
243 | elif col_type == 'numberType.TIME': | ||
244 | if isinstance(value, (int, float)): | ||
245 | try: | ||
246 | total_secs = value * 86400 # seconds in day | ||
247 | col_val = str(timedelta(seconds=total_secs)) | ||
248 | except ValueError: | ||
249 | col_val = str(value) | 240 | col_val = str(value) |
250 | else: | 241 | # DATE |
251 | col_val = str(value) | 242 | elif col_type == 'numberType.DATE': |
252 | elif col_type == 'numberType': | 243 | if isinstance(value, (int, float)): |
253 | if isinstance(value, int): | 244 | col_val = excel_to_dttm_str(value)[:10] |
254 | col_val = int(value) | 245 | else: |
255 | else: | ||
256 | try: | ||
257 | col_val = float(value) | ||
258 | except ValueError: | ||
259 | col_val = str(value) | 246 | col_val = str(value) |
260 | elif col_type == 'stringValue': | 247 | # TIME ONLY (NO DATE) |
261 | col_val = str(value) | 248 | elif col_type == 'numberType.TIME': |
262 | elif col_type == 'boolValue': | 249 | if isinstance(value, (int, float)): |
263 | if isinstance(value, bool): | 250 | try: |
264 | col_val = value | 251 | total_secs = value * 86400 # seconds in day |
265 | elif isinstance(value, str): | 252 | # Create string formatted like HH:MM:SS |
266 | if value.lower() in ('true', 't', 'yes', 'y'): | 253 | col_val = str(timedelta(seconds=total_secs)) |
267 | col_val = True | 254 | except ValueError: |
268 | elif value.lower() in ('false', 'f', 'no', 'n'): | 255 | col_val = str(value) |
269 | col_val = False | ||
270 | else: | 256 | else: |
271 | col_val = str(value) | 257 | col_val = str(value) |
272 | elif isinstance(value, int): | 258 | # NUMBER (INTEGER AND FLOAT) |
273 | if value in (1, -1): | 259 | elif col_type == 'numberType': |
274 | col_val = True | 260 | if isinstance(value, int): |
275 | elif value == 0: | 261 | col_val = int(value) |
276 | col_val = False | 262 | elif isinstance(value, float): |
263 | # Determine float decimal digits | ||
264 | decimal_digits = str(value)[::-1].find('.') | ||
265 | if decimal_digits > 15: | ||
266 | try: | ||
267 | # ROUND to multipleOf: 1e-15 | ||
268 | col_val = float(round(value, 15)) | ||
269 | except ValueError: | ||
270 | col_val = str(value) | ||
271 | else: # decimal_digits <= 15, no rounding | ||
272 | try: | ||
273 | col_val = float(value) | ||
274 | except ValueError: | ||
275 | col_val = str(value) | ||
277 | else: | 276 | else: |
278 | col_val = str(value) | 277 | col_val = str(value) |
279 | 278 | # STRING | |
280 | else: | 279 | elif col_type == 'stringValue': |
281 | col_val = value | 280 | col_val = str(value) |
282 | sheet_data_row_tf[col_name] = col_val | 281 | # BOOLEAN |
283 | col_num = col_num + 1 | 282 | elif col_type == 'boolValue': |
284 | sheet_data_tf.append(sheet_data_row_tf) | 283 | if isinstance(value, bool): |
284 | col_val = value | ||
285 | elif isinstance(value, str): | ||
286 | if value.lower() in ('true', 't', 'yes', 'y'): | ||
287 | col_val = True | ||
288 | elif value.lower() in ('false', 'f', 'no', 'n'): | ||
289 | col_val = False | ||
290 | else: | ||
291 | col_val = str(value) | ||
292 | elif isinstance(value, int): | ||
293 | if value in (1, -1): | ||
294 | col_val = True | ||
295 | elif value == 0: | ||
296 | col_val = False | ||
297 | else: | ||
298 | col_val = str(value) | ||
299 | # OTHER: Convert everything else to a string | ||
300 | else: | ||
301 | col_val = str(value) | ||
302 | sheet_data_row_tf[col_name] = col_val | ||
303 | col_num = col_num + 1 | ||
304 | # APPEND non-empty row | ||
305 | sheet_data_tf.append(sheet_data_row_tf) | ||
285 | row_num = row_num + 1 | 306 | row_num = row_num + 1 |
286 | return sheet_data_tf, row_num, is_last_row | 307 | return sheet_data_tf, row_num |
287 | 308 | ||
288 | 309 | ||
289 | def sync(client, config, catalog, state): | 310 | def sync(client, config, catalog, state): |
@@ -327,7 +348,7 @@ def sync(client, config, catalog, state): | |||
327 | return | 348 | return |
328 | # Sync file_metadata if selected | 349 | # Sync file_metadata if selected |
329 | sync_stream(stream_name, selected_streams, catalog, state, file_metadata_tf, time_extracted) | 350 | sync_stream(stream_name, selected_streams, catalog, state, file_metadata_tf, time_extracted) |
330 | write_bookmark(state, stream_name, strftime(this_datetime)) | 351 | # file_metadata bookmark is updated at the end of sync |
331 | 352 | ||
332 | # SPREADSHEET_METADATA | 353 | # SPREADSHEET_METADATA |
333 | spreadsheet_metadata = {} | 354 | spreadsheet_metadata = {} |
@@ -363,7 +384,7 @@ def sync(client, config, catalog, state): | |||
363 | 384 | ||
364 | # GET sheet_metadata and columns | 385 | # GET sheet_metadata and columns |
365 | sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) | 386 | sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) |
366 | LOGGER.info('sheet_schema: {}'.format(sheet_schema)) | 387 | # LOGGER.info('sheet_schema: {}'.format(sheet_schema)) |
367 | 388 | ||
368 | # Transform sheet_metadata | 389 | # Transform sheet_metadata |
369 | sheet_metadata_tf = transform_sheet_metadata(spreadsheet_id, sheet, columns) | 390 | sheet_metadata_tf = transform_sheet_metadata(spreadsheet_id, sheet, columns) |
@@ -414,7 +435,7 @@ def sync(client, config, catalog, state): | |||
414 | sheet_data_rows = sheet_data.get('values') | 435 | sheet_data_rows = sheet_data.get('values') |
415 | 436 | ||
416 | # Transform batch of rows to JSON with keys for each column | 437 | # Transform batch of rows to JSON with keys for each column |
417 | sheet_data_tf, row_num, is_last_row = transform_sheet_data( | 438 | sheet_data_tf, row_num = transform_sheet_data( |
418 | spreadsheet_id=spreadsheet_id, | 439 | spreadsheet_id=spreadsheet_id, |
419 | sheet_id=sheet_id, | 440 | sheet_id=sheet_id, |
420 | from_row=from_row, | 441 | from_row=from_row, |
@@ -429,7 +450,7 @@ def sync(client, config, catalog, state): | |||
429 | stream_name=sheet_title, | 450 | stream_name=sheet_title, |
430 | records=sheet_data_tf, | 451 | records=sheet_data_tf, |
431 | time_extracted=ss_time_extracted) | 452 | time_extracted=ss_time_extracted) |
432 | LOGGER.info('Sheet: {}, ecords processed: {}'.format( | 453 | LOGGER.info('Sheet: {}, records processed: {}'.format( |
433 | sheet_title, record_count)) | 454 | sheet_title, record_count)) |
434 | 455 | ||
435 | # Update paging from/to_row for next batch | 456 | # Update paging from/to_row for next batch |
@@ -458,7 +479,7 @@ def sync(client, config, catalog, state): | |||
458 | singer.write_message(activate_version_message) | 479 | singer.write_message(activate_version_message) |
459 | 480 | ||
460 | LOGGER.info('FINISHED Syncing Sheet {}, Total Rows: {}'.format( | 481 | LOGGER.info('FINISHED Syncing Sheet {}, Total Rows: {}'.format( |
461 | sheet_title, row_num - 1)) | 482 | sheet_title, row_num - 2)) # subtract 1 for header row |
462 | 483 | ||
463 | stream_name = 'sheet_metadata' | 484 | stream_name = 'sheet_metadata' |
464 | # Sync sheet_metadata if selected | 485 | # Sync sheet_metadata if selected |
@@ -468,4 +489,7 @@ def sync(client, config, catalog, state): | |||
468 | # Sync sheet_metadata if selected | 489 | # Sync sheet_metadata if selected |
469 | sync_stream(stream_name, selected_streams, catalog, state, sheets_loaded) | 490 | sync_stream(stream_name, selected_streams, catalog, state, sheets_loaded) |
470 | 491 | ||
492 | # Update file_metadata bookmark | ||
493 | write_bookmark(state, 'file_metadata', strftime(this_datetime)) | ||
494 | |||
471 | return | 495 | return |