diff options
author | Jeff Huth <39202799+jeffhuth-bytecode@users.noreply.github.com> | 2019-12-04 06:10:46 -0800 |
---|---|---|
committer | Kyle Allan <KAllan357@gmail.com> | 2019-12-04 09:10:46 -0500 |
commit | 5890b89c1aa7c554235b3cef156b5a5a2c594bec (patch) | |
tree | 2f553cda853991aedb1cec68dc6d06b87d6fe190 /tap_google_sheets/schema.py | |
parent | 075af7096d3c1b369702feba4076c25b954732dc (diff) | |
download | tap-google-sheets-5890b89c1aa7c554235b3cef156b5a5a2c594bec.tar.gz tap-google-sheets-5890b89c1aa7c554235b3cef156b5a5a2c594bec.tar.zst tap-google-sheets-5890b89c1aa7c554235b3cef156b5a5a2c594bec.zip |
v.0.0.2 schema and sync changes (#1) v0.0.2
Change number json schema to anyOf with multipleOf; skip empty rows; move write_bookmark to end of sync.py
Diffstat (limited to 'tap_google_sheets/schema.py')
-rw-r--r-- | tap_google_sheets/schema.py | 83 |
1 file changed, 56 insertions, 27 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index d4fead5..243467b 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py | |||
@@ -30,8 +30,6 @@ def get_sheet_schema_columns(sheet): | |||
30 | first_values = row_data[1].get('values', []) | 30 | first_values = row_data[1].get('values', []) |
31 | # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) | 31 | # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) |
32 | 32 | ||
33 | sheet_json_schema['type'] = 'object' | ||
34 | sheet_json_schema['additionalProperties'] = False | ||
35 | sheet_json_schema = { | 33 | sheet_json_schema = { |
36 | 'type': 'object', | 34 | 'type': 'object', |
37 | 'additionalProperties': False, | 35 | 'additionalProperties': False, |
@@ -89,42 +87,66 @@ def get_sheet_schema_columns(sheet): | |||
89 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType | 87 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType |
90 | # | 88 | # |
91 | column_format = None # Default | 89 | column_format = None # Default |
92 | # column_multiple_of = None # Default | ||
93 | if column_effective_value_type == 'stringValue': | 90 | if column_effective_value_type == 'stringValue': |
94 | column_type = ['null', 'string'] | 91 | col_properties = {'type': ['null', 'string']} |
95 | column_gs_type = 'stringValue' | 92 | column_gs_type = 'stringValue' |
96 | elif column_effective_value_type == 'boolValue': | 93 | elif column_effective_value_type == 'boolValue': |
97 | column_type = ['null', 'boolean', 'string'] | 94 | col_properties = {'type': ['null', 'boolean', 'string']} |
98 | column_gs_type = 'boolValue' | 95 | column_gs_type = 'boolValue' |
99 | elif column_effective_value_type == 'numberValue': | 96 | elif column_effective_value_type == 'numberValue': |
100 | if column_number_format_type == 'DATE_TIME': | 97 | if column_number_format_type == 'DATE_TIME': |
101 | column_type = ['null', 'string'] | 98 | col_properties = { |
102 | column_format = 'date-time' | 99 | 'type': ['null', 'string'], |
100 | 'format': 'date-time' | ||
101 | } | ||
103 | column_gs_type = 'numberType.DATE_TIME' | 102 | column_gs_type = 'numberType.DATE_TIME' |
104 | elif column_number_format_type == 'DATE': | 103 | elif column_number_format_type == 'DATE': |
105 | column_type = ['null', 'string'] | 104 | col_properties = { |
106 | column_format = 'date' | 105 | 'type': ['null', 'string'], |
106 | 'format': 'date' | ||
107 | } | ||
107 | column_gs_type = 'numberType.DATE' | 108 | column_gs_type = 'numberType.DATE' |
108 | elif column_number_format_type == 'TIME': | 109 | elif column_number_format_type == 'TIME': |
109 | column_type = ['null', 'string'] | 110 | col_properties = { |
110 | column_format = 'time' | 111 | 'type': ['null', 'string'], |
112 | 'format': 'time' | ||
113 | } | ||
111 | column_gs_type = 'numberType.TIME' | 114 | column_gs_type = 'numberType.TIME' |
112 | elif column_number_format_type == 'TEXT': | 115 | elif column_number_format_type == 'TEXT': |
113 | column_type = ['null', 'string'] | 116 | col_properties = {'type': ['null', 'string']} |
114 | column_gs_type = 'stringValue' | 117 | column_gs_type = 'stringValue' |
115 | else: | 118 | else: |
116 | column_type = ['null', 'number', 'string'] | 119 | # Interesting - order in the anyOf makes a difference. |
120 | # Number w/ multipleOf must be listed last, otherwise errors occur. | ||
121 | col_properties = { | ||
122 | 'anyOf': [ | ||
123 | { | ||
124 | 'type': 'string' | ||
125 | }, | ||
126 | { | ||
127 | 'type': 'null' | ||
128 | }, | ||
129 | { | ||
130 | 'type': 'number', | ||
131 | 'multipleOf': 1e-15 | ||
132 | } | ||
133 | ] | ||
134 | } | ||
117 | column_gs_type = 'numberType' | 135 | column_gs_type = 'numberType' |
118 | elif column_effective_value_type in ('formulaValue', 'errorValue'): | 136 | # Catch-all to deal with other types and set to string |
119 | raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(column_name, \ | 137 | # column_effective_value_type: formulaValue, errorValue, or other |
138 | else: | ||
139 | col_properties = {'type': ['null', 'string']} | ||
140 | column_gs_type = 'unsupportedValue' | ||
141 | LOGGER.info('Unsupported data type: {}, value: {}'.format(column_name, \ | ||
120 | column_effective_value_type)) | 142 | column_effective_value_type)) |
143 | LOGGER.info('Converting to string.') | ||
121 | else: # skipped | 144 | else: # skipped |
122 | column_is_skipped = True | 145 | column_is_skipped = True |
123 | skipped = skipped + 1 | 146 | skipped = skipped + 1 |
124 | column_index_str = str(column_index).zfill(2) | 147 | column_index_str = str(column_index).zfill(2) |
125 | column_name = '__sdc_skip_col_{}'.format(column_index_str) | 148 | column_name = '__sdc_skip_col_{}'.format(column_index_str) |
126 | column_type = ['null', 'string'] | 149 | col_properties = {'type': ['null', 'string']} |
127 | column_format = None | ||
128 | column_gs_type = 'stringValue' | 150 | column_gs_type = 'stringValue' |
129 | 151 | ||
130 | if skipped >= 2: | 152 | if skipped >= 2: |
@@ -144,10 +166,7 @@ def get_sheet_schema_columns(sheet): | |||
144 | } | 166 | } |
145 | columns.append(column) | 167 | columns.append(column) |
146 | 168 | ||
147 | sheet_json_schema['properties'][column_name] = column | 169 | sheet_json_schema['properties'][column_name] = col_properties |
148 | sheet_json_schema['properties'][column_name]['type'] = column_type | ||
149 | if column_format: | ||
150 | sheet_json_schema['properties'][column_name]['format'] = column_format | ||
151 | 170 | ||
152 | prior_header = column_name | 171 | prior_header = column_name |
153 | i = i + 1 | 172 | i = i + 1 |
@@ -155,6 +174,10 @@ def get_sheet_schema_columns(sheet): | |||
155 | return sheet_json_schema, columns | 174 | return sheet_json_schema, columns |
156 | 175 | ||
157 | 176 | ||
177 | # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query | ||
178 | # endpoint: spreadsheets/{spreadsheet_id} | ||
179 | # params: includeGridData = true, ranges = '{sheet_title}'!1:2 | ||
180 | # This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc. | ||
158 | def get_sheet_metadata(sheet, spreadsheet_id, client): | 181 | def get_sheet_metadata(sheet, spreadsheet_id, client): |
159 | sheet_id = sheet.get('properties', {}).get('sheetId') | 182 | sheet_id = sheet.get('properties', {}).get('sheetId') |
160 | sheet_title = sheet.get('properties', {}).get('title') | 183 | sheet_title = sheet.get('properties', {}).get('title') |
@@ -170,10 +193,13 @@ def get_sheet_metadata(sheet, spreadsheet_id, client): | |||
170 | spreadsheet_id), querystring) | 193 | spreadsheet_id), querystring) |
171 | 194 | ||
172 | sheet_md_results = client.get(path=path, api=api, endpoint=stream_name) | 195 | sheet_md_results = client.get(path=path, api=api, endpoint=stream_name) |
173 | sheet_cols = sheet_md_results.get('sheets')[0] | 196 | # sheet_metadata: 1st `sheets` node in results |
174 | sheet_schema, columns = get_sheet_schema_columns(sheet_cols) | 197 | sheet_metadata = sheet_md_results.get('sheets')[0] |
175 | 198 | ||
176 | return sheet_schema, columns | 199 | # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results) |
200 | sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata) | ||
201 | |||
202 | return sheet_json_schema, columns | ||
177 | 203 | ||
178 | 204 | ||
179 | def get_abs_path(path): | 205 | def get_abs_path(path): |
@@ -209,20 +235,23 @@ def get_schemas(client, spreadsheet_id): | |||
209 | path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ | 235 | path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ |
210 | spreadsheet_id), querystring) | 236 | spreadsheet_id), querystring) |
211 | 237 | ||
238 | # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet) | ||
212 | spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \ | 239 | spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \ |
213 | endpoint=stream_name) | 240 | endpoint=stream_name) |
214 | 241 | ||
215 | sheets = spreadsheet_md_results.get('sheets') | 242 | sheets = spreadsheet_md_results.get('sheets') |
216 | if sheets: | 243 | if sheets: |
244 | # Loop thru each worksheet in spreadsheet | ||
217 | for sheet in sheets: | 245 | for sheet in sheets: |
218 | sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) | 246 | # GET sheet_json_schema for each worksheet (from function above) |
247 | sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) | ||
219 | LOGGER.info('columns = {}'.format(columns)) | 248 | LOGGER.info('columns = {}'.format(columns)) |
220 | 249 | ||
221 | sheet_title = sheet.get('properties', {}).get('title') | 250 | sheet_title = sheet.get('properties', {}).get('title') |
222 | schemas[sheet_title] = sheet_schema | 251 | schemas[sheet_title] = sheet_json_schema |
223 | sheet_mdata = metadata.new() | 252 | sheet_mdata = metadata.new() |
224 | sheet_mdata = metadata.get_standard_metadata( | 253 | sheet_mdata = metadata.get_standard_metadata( |
225 | schema=sheet_schema, | 254 | schema=sheet_json_schema, |
226 | key_properties=['__sdc_row'], | 255 | key_properties=['__sdc_row'], |
227 | valid_replication_keys=None, | 256 | valid_replication_keys=None, |
228 | replication_method='FULL_TABLE' | 257 | replication_method='FULL_TABLE' |