diff options
Diffstat (limited to 'tap_google_sheets/schema.py')
-rw-r--r-- | tap_google_sheets/schema.py | 56 |
1 files changed, 30 insertions, 26 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index 237ab06..d4fead5 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py | |||
@@ -11,19 +11,19 @@ LOGGER = singer.get_logger() | |||
11 | # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata | 11 | # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata |
12 | 12 | ||
13 | # Convert column index to column letter | 13 | # Convert column index to column letter |
14 | def colnum_string(n): | 14 | def colnum_string(num): |
15 | string = "" | 15 | string = "" |
16 | while n > 0: | 16 | while num > 0: |
17 | n, remainder = divmod(n - 1, 26) | 17 | num, remainder = divmod(num - 1, 26) |
18 | string = chr(65 + remainder) + string | 18 | string = chr(65 + remainder) + string |
19 | return string | 19 | return string |
20 | 20 | ||
21 | 21 | ||
22 | # Create sheet_metadata_json with columns from sheet | 22 | # Create sheet_metadata_json with columns from sheet |
23 | def get_sheet_schema_columns(sheet, spreadsheet_id, client): | 23 | def get_sheet_schema_columns(sheet): |
24 | sheet_json_schema = OrderedDict() | 24 | sheet_json_schema = OrderedDict() |
25 | data = next(iter(sheet.get('data', [])), {}) | 25 | data = next(iter(sheet.get('data', [])), {}) |
26 | row_data = data.get('rowData',[]) | 26 | row_data = data.get('rowData', []) |
27 | # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse | 27 | # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse |
28 | 28 | ||
29 | headers = row_data[0].get('values', []) | 29 | headers = row_data[0].get('values', []) |
@@ -65,33 +65,32 @@ def get_sheet_schema_columns(sheet, spreadsheet_id, client): | |||
65 | column_name = '{}'.format(header_value) | 65 | column_name = '{}'.format(header_value) |
66 | if column_name in header_list: | 66 | if column_name in header_list: |
67 | raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name)) | 67 | raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name)) |
68 | else: | 68 | header_list.append(column_name) |
69 | header_list.append(column_name) | ||
70 | 69 | ||
71 | first_value = first_values[i] | 70 | first_value = first_values[i] |
72 | # LOGGER.info('first_value[{}] = {}'.format(i, json.dumps(first_value, indent=2, sort_keys=True))) | ||
73 | 71 | ||
74 | column_effective_value = first_value.get('effectiveValue', {}) | 72 | column_effective_value = first_value.get('effectiveValue', {}) |
75 | for key in column_effective_value.keys(): | 73 | for key in column_effective_value.keys(): |
76 | if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'): | 74 | if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'): |
77 | column_effective_value_type = key | 75 | column_effective_value_type = key |
78 | 76 | ||
79 | column_number_format = first_values[i].get('effectiveFormat', {}).get('numberFormat', {}) | 77 | column_number_format = first_values[i].get('effectiveFormat', {}).get( |
78 | 'numberFormat', {}) | ||
80 | column_number_format_type = column_number_format.get('type') | 79 | column_number_format_type = column_number_format.get('type') |
81 | 80 | ||
82 | # Determine datatype for sheet_json_schema | 81 | # Determine datatype for sheet_json_schema |
83 | # | 82 | # |
84 | # column_effective_value_type = numberValue, stringValue, boolValue; INVALID: errorType, formulaType | 83 | # column_effective_value_type = numberValue, stringValue, boolValue; |
85 | # Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue | 84 | # INVALID: errorType, formulaType |
85 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue | ||
86 | # | 86 | # |
87 | # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, TIME, DATE_TIME, SCIENTIFIC | 87 | # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, |
88 | # Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType | 88 | # TIME, DATE_TIME, SCIENTIFIC |
89 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType | ||
89 | # | 90 | # |
90 | column_format = None # Default | 91 | column_format = None # Default |
91 | # column_multiple_of = None # Default | 92 | # column_multiple_of = None # Default |
92 | if column_effective_value_type in ('formulaValue', 'errorValue'): | 93 | if column_effective_value_type == 'stringValue': |
93 | raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(column_name)) | ||
94 | elif column_effective_value_type == 'stringValue': | ||
95 | column_type = ['null', 'string'] | 94 | column_type = ['null', 'string'] |
96 | column_gs_type = 'stringValue' | 95 | column_gs_type = 'stringValue' |
97 | elif column_effective_value_type == 'boolValue': | 96 | elif column_effective_value_type == 'boolValue': |
@@ -116,7 +115,9 @@ def get_sheet_schema_columns(sheet, spreadsheet_id, client): | |||
116 | else: | 115 | else: |
117 | column_type = ['null', 'number', 'string'] | 116 | column_type = ['null', 'number', 'string'] |
118 | column_gs_type = 'numberType' | 117 | column_gs_type = 'numberType' |
119 | 118 | elif column_effective_value_type in ('formulaValue', 'errorValue'): | |
119 | raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(column_name, \ | ||
120 | column_effective_value_type)) | ||
120 | else: # skipped | 121 | else: # skipped |
121 | column_is_skipped = True | 122 | column_is_skipped = True |
122 | skipped = skipped + 1 | 123 | skipped = skipped + 1 |
@@ -130,7 +131,6 @@ def get_sheet_schema_columns(sheet, spreadsheet_id, client): | |||
130 | # skipped = 2 consecutive skipped headers | 131 | # skipped = 2 consecutive skipped headers |
131 | # Remove prior_header column_name | 132 | # Remove prior_header column_name |
132 | sheet_json_schema['properties'].pop(prior_header, None) | 133 | sheet_json_schema['properties'].pop(prior_header, None) |
133 | column_count = i - 1 | ||
134 | break | 134 | break |
135 | 135 | ||
136 | else: | 136 | else: |
@@ -164,12 +164,14 @@ def get_sheet_metadata(sheet, spreadsheet_id, client): | |||
164 | stream_metadata = STREAMS.get(stream_name) | 164 | stream_metadata = STREAMS.get(stream_name) |
165 | api = stream_metadata.get('api', 'sheets') | 165 | api = stream_metadata.get('api', 'sheets') |
166 | params = stream_metadata.get('params', {}) | 166 | params = stream_metadata.get('params', {}) |
167 | querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]).replace('{sheet_title}', sheet_title) | 167 | querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \ |
168 | path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id), querystring) | 168 | params.items()]).replace('{sheet_title}', sheet_title) |
169 | path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ | ||
170 | spreadsheet_id), querystring) | ||
169 | 171 | ||
170 | sheet_md_results = client.get(path=path, api=api, endpoint=stream_name) | 172 | sheet_md_results = client.get(path=path, api=api, endpoint=stream_name) |
171 | sheet_cols = sheet_md_results.get('sheets')[0] | 173 | sheet_cols = sheet_md_results.get('sheets')[0] |
172 | sheet_schema, columns = get_sheet_schema_columns(sheet_cols, spreadsheet_id, client) | 174 | sheet_schema, columns = get_sheet_schema_columns(sheet_cols) |
173 | 175 | ||
174 | return sheet_schema, columns | 176 | return sheet_schema, columns |
175 | 177 | ||
@@ -199,20 +201,22 @@ def get_schemas(client, spreadsheet_id): | |||
199 | replication_method=stream_metadata.get('replication_method', None) | 201 | replication_method=stream_metadata.get('replication_method', None) |
200 | ) | 202 | ) |
201 | field_metadata[stream_name] = mdata | 203 | field_metadata[stream_name] = mdata |
202 | 204 | ||
203 | if stream_name == 'spreadsheet_metadata': | 205 | if stream_name == 'spreadsheet_metadata': |
204 | api = stream_metadata.get('api', 'sheets') | 206 | api = stream_metadata.get('api', 'sheets') |
205 | params = stream_metadata.get('params', {}) | 207 | params = stream_metadata.get('params', {}) |
206 | querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]) | 208 | querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]) |
207 | path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id), querystring) | 209 | path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ |
210 | spreadsheet_id), querystring) | ||
208 | 211 | ||
209 | spreadsheet_md_results = client.get(path=path, params=querystring, api=api, endpoint=stream_name) | 212 | spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \ |
213 | endpoint=stream_name) | ||
210 | 214 | ||
211 | sheets = spreadsheet_md_results.get('sheets') | 215 | sheets = spreadsheet_md_results.get('sheets') |
212 | if sheets: | 216 | if sheets: |
213 | for sheet in sheets: | 217 | for sheet in sheets: |
214 | sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) | 218 | sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) |
215 | # LOGGER.info('sheet_schema = {}'.format(json.dumps(sheet_schema, indent=2, sort_keys=True))) | 219 | LOGGER.info('columns = {}'.format(columns)) |
216 | 220 | ||
217 | sheet_title = sheet.get('properties', {}).get('title') | 221 | sheet_title = sheet.get('properties', {}).get('title') |
218 | schemas[sheet_title] = sheet_schema | 222 | schemas[sheet_title] = sheet_schema |
@@ -224,5 +228,5 @@ def get_schemas(client, spreadsheet_id): | |||
224 | replication_method='FULL_TABLE' | 228 | replication_method='FULL_TABLE' |
225 | ) | 229 | ) |
226 | field_metadata[sheet_title] = sheet_mdata | 230 | field_metadata[sheet_title] = sheet_mdata |
227 | 231 | ||
228 | return schemas, field_metadata | 232 | return schemas, field_metadata |