diff options
author | Jeff Huth <39202799+jeffhuth-bytecode@users.noreply.github.com> | 2020-02-24 09:53:26 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-02-24 12:53:26 -0500 |
commit | 376f1145837541d4fff2ad0e499236761f9873c3 (patch) | |
tree | cc086f18b24bda8a86c16c3ec742b89947f382ae /tap_google_sheets/schema.py | |
parent | f1d1d43c6b74a8705e91e908c582e39c68464c0c (diff) | |
download | tap-google-sheets-376f1145837541d4fff2ad0e499236761f9873c3.tar.gz tap-google-sheets-376f1145837541d4fff2ad0e499236761f9873c3.tar.zst tap-google-sheets-376f1145837541d4fff2ad0e499236761f9873c3.zip |
v.0.0.4 Logic to skip empty sheets (#4)v0.0.4
* v.0.0.2 schema and sync changes
Change number json schema to anyOf with multipleOf; skip empty rows; move write_bookmark to end of sync.py
* v.0.0.3 Sync activate version and error handling
Update README.md documentation. Improved logging and handling of errors and warnings. Better null handling in Discovery and Sync. Fix issues with activate version messages.
* v.0.0.4 Skip empty worksheets
Add logic to skip empty worksheets in Discovery and Sync mode.
* schema.py fix number datatype issue
Nomber datatypes are being created as strings in targets. The JSON schema order needs to be adjusted so that order is null, number, string.
Diffstat (limited to 'tap_google_sheets/schema.py')
-rw-r--r-- | tap_google_sheets/schema.py | 365 |
1 files changed, 185 insertions, 180 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index e319c03..c229d72 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py | |||
@@ -25,184 +25,188 @@ def get_sheet_schema_columns(sheet): | |||
25 | sheet_json_schema = OrderedDict() | 25 | sheet_json_schema = OrderedDict() |
26 | data = next(iter(sheet.get('data', [])), {}) | 26 | data = next(iter(sheet.get('data', [])), {}) |
27 | row_data = data.get('rowData', []) | 27 | row_data = data.get('rowData', []) |
28 | # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse | 28 | if row_data == []: |
29 | 29 | # Empty sheet, SKIP | |
30 | headers = row_data[0].get('values', []) | 30 | LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title)) |
31 | first_values = row_data[1].get('values', []) | 31 | return None, None |
32 | # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) | 32 | else: |
33 | 33 | # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse | |
34 | sheet_json_schema = { | 34 | headers = row_data[0].get('values', []) |
35 | 'type': 'object', | 35 | first_values = row_data[1].get('values', []) |
36 | 'additionalProperties': False, | 36 | # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) |
37 | 'properties': { | 37 | |
38 | '__sdc_spreadsheet_id': { | 38 | sheet_json_schema = { |
39 | 'type': ['null', 'string'] | 39 | 'type': 'object', |
40 | }, | 40 | 'additionalProperties': False, |
41 | '__sdc_sheet_id': { | 41 | 'properties': { |
42 | 'type': ['null', 'integer'] | 42 | '__sdc_spreadsheet_id': { |
43 | }, | 43 | 'type': ['null', 'string'] |
44 | '__sdc_row': { | 44 | }, |
45 | 'type': ['null', 'integer'] | 45 | '__sdc_sheet_id': { |
46 | 'type': ['null', 'integer'] | ||
47 | }, | ||
48 | '__sdc_row': { | ||
49 | 'type': ['null', 'integer'] | ||
50 | } | ||
46 | } | 51 | } |
47 | } | 52 | } |
48 | } | 53 | |
49 | 54 | header_list = [] # used for checking uniqueness | |
50 | header_list = [] # used for checking uniqueness | 55 | columns = [] |
51 | columns = [] | 56 | prior_header = None |
52 | prior_header = None | 57 | i = 0 |
53 | i = 0 | 58 | skipped = 0 |
54 | skipped = 0 | 59 | # Read column headers until end or 2 consecutive skipped headers |
55 | # Read column headers until end or 2 consecutive skipped headers | 60 | for header in headers: |
56 | for header in headers: | 61 | # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True))) |
57 | # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True))) | 62 | column_index = i + 1 |
58 | column_index = i + 1 | 63 | column_letter = colnum_string(column_index) |
59 | column_letter = colnum_string(column_index) | 64 | header_value = header.get('formattedValue') |
60 | header_value = header.get('formattedValue') | 65 | if header_value: # NOT skipped |
61 | if header_value: # NOT skipped | 66 | column_is_skipped = False |
62 | column_is_skipped = False | 67 | skipped = 0 |
63 | skipped = 0 | 68 | column_name = '{}'.format(header_value) |
64 | column_name = '{}'.format(header_value) | 69 | if column_name in header_list: |
65 | if column_name in header_list: | 70 | raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format( |
66 | raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format( | ||
67 | sheet_title, column_name, column_letter)) | ||
68 | header_list.append(column_name) | ||
69 | |||
70 | first_value = None | ||
71 | try: | ||
72 | first_value = first_values[i] | ||
73 | except IndexError as err: | ||
74 | raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format( | ||
75 | sheet_title, column_name, column_letter, err)) | ||
76 | |||
77 | column_effective_value = first_value.get('effectiveValue', {}) | ||
78 | |||
79 | col_val = None | ||
80 | if column_effective_value == {}: | ||
81 | column_effective_value_type = 'stringValue' | ||
82 | LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format( | ||
83 | sheet_title, column_name, column_letter)) | ||
84 | LOGGER.info(' Setting column datatype to STRING') | ||
85 | else: | ||
86 | for key, val in column_effective_value.items(): | ||
87 | if key in ('numberValue', 'stringValue', 'boolValue'): | ||
88 | column_effective_value_type = key | ||
89 | col_val = str(val) | ||
90 | elif key in ('errorType', 'formulaType'): | ||
91 | col_val = str(val) | ||
92 | raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( | ||
93 | sheet_title, column_name, column_letter, key, col_val)) | ||
94 | |||
95 | column_number_format = first_values[i].get('effectiveFormat', {}).get( | ||
96 | 'numberFormat', {}) | ||
97 | column_number_format_type = column_number_format.get('type') | ||
98 | |||
99 | # Determine datatype for sheet_json_schema | ||
100 | # | ||
101 | # column_effective_value_type = numberValue, stringValue, boolValue; | ||
102 | # INVALID: errorType, formulaType | ||
103 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue | ||
104 | # | ||
105 | # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, | ||
106 | # TIME, DATE_TIME, SCIENTIFIC | ||
107 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType | ||
108 | # | ||
109 | column_format = None # Default | ||
110 | if column_effective_value == {}: | ||
111 | col_properties = {'type': ['null', 'string']} | ||
112 | column_gs_type = 'stringValue' | ||
113 | LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format( | ||
114 | sheet_title, column_name, column_letter)) | 71 | sheet_title, column_name, column_letter)) |
115 | LOGGER.info(' Setting column datatype to STRING') | 72 | header_list.append(column_name) |
116 | elif column_effective_value_type == 'stringValue': | 73 | |
117 | col_properties = {'type': ['null', 'string']} | 74 | first_value = None |
118 | column_gs_type = 'stringValue' | 75 | try: |
119 | elif column_effective_value_type == 'boolValue': | 76 | first_value = first_values[i] |
120 | col_properties = {'type': ['null', 'boolean', 'string']} | 77 | except IndexError as err: |
121 | column_gs_type = 'boolValue' | 78 | raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format( |
122 | elif column_effective_value_type == 'numberValue': | 79 | sheet_title, column_name, column_letter, err)) |
123 | if column_number_format_type == 'DATE_TIME': | 80 | |
124 | col_properties = { | 81 | column_effective_value = first_value.get('effectiveValue', {}) |
125 | 'type': ['null', 'string'], | 82 | |
126 | 'format': 'date-time' | 83 | col_val = None |
127 | } | 84 | if column_effective_value == {}: |
128 | column_gs_type = 'numberType.DATE_TIME' | 85 | column_effective_value_type = 'stringValue' |
129 | elif column_number_format_type == 'DATE': | 86 | LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format( |
130 | col_properties = { | 87 | sheet_title, column_name, column_letter)) |
131 | 'type': ['null', 'string'], | 88 | LOGGER.info(' Setting column datatype to STRING') |
132 | 'format': 'date' | 89 | else: |
133 | } | 90 | for key, val in column_effective_value.items(): |
134 | column_gs_type = 'numberType.DATE' | 91 | if key in ('numberValue', 'stringValue', 'boolValue'): |
135 | elif column_number_format_type == 'TIME': | 92 | column_effective_value_type = key |
136 | col_properties = { | 93 | col_val = str(val) |
137 | 'type': ['null', 'string'], | 94 | elif key in ('errorType', 'formulaType'): |
138 | 'format': 'time' | 95 | col_val = str(val) |
139 | } | 96 | raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( |
140 | column_gs_type = 'numberType.TIME' | 97 | sheet_title, column_name, column_letter, key, col_val)) |
141 | elif column_number_format_type == 'TEXT': | 98 | |
99 | column_number_format = first_values[i].get('effectiveFormat', {}).get( | ||
100 | 'numberFormat', {}) | ||
101 | column_number_format_type = column_number_format.get('type') | ||
102 | |||
103 | # Determine datatype for sheet_json_schema | ||
104 | # | ||
105 | # column_effective_value_type = numberValue, stringValue, boolValue; | ||
106 | # INVALID: errorType, formulaType | ||
107 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue | ||
108 | # | ||
109 | # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, | ||
110 | # TIME, DATE_TIME, SCIENTIFIC | ||
111 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType | ||
112 | # | ||
113 | column_format = None # Default | ||
114 | if column_effective_value == {}: | ||
115 | col_properties = {'type': ['null', 'string']} | ||
116 | column_gs_type = 'stringValue' | ||
117 | LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format( | ||
118 | sheet_title, column_name, column_letter)) | ||
119 | LOGGER.info(' Setting column datatype to STRING') | ||
120 | elif column_effective_value_type == 'stringValue': | ||
142 | col_properties = {'type': ['null', 'string']} | 121 | col_properties = {'type': ['null', 'string']} |
143 | column_gs_type = 'stringValue' | 122 | column_gs_type = 'stringValue' |
123 | elif column_effective_value_type == 'boolValue': | ||
124 | col_properties = {'type': ['null', 'boolean', 'string']} | ||
125 | column_gs_type = 'boolValue' | ||
126 | elif column_effective_value_type == 'numberValue': | ||
127 | if column_number_format_type == 'DATE_TIME': | ||
128 | col_properties = { | ||
129 | 'type': ['null', 'string'], | ||
130 | 'format': 'date-time' | ||
131 | } | ||
132 | column_gs_type = 'numberType.DATE_TIME' | ||
133 | elif column_number_format_type == 'DATE': | ||
134 | col_properties = { | ||
135 | 'type': ['null', 'string'], | ||
136 | 'format': 'date' | ||
137 | } | ||
138 | column_gs_type = 'numberType.DATE' | ||
139 | elif column_number_format_type == 'TIME': | ||
140 | col_properties = { | ||
141 | 'type': ['null', 'string'], | ||
142 | 'format': 'time' | ||
143 | } | ||
144 | column_gs_type = 'numberType.TIME' | ||
145 | elif column_number_format_type == 'TEXT': | ||
146 | col_properties = {'type': ['null', 'string']} | ||
147 | column_gs_type = 'stringValue' | ||
148 | else: | ||
149 | # Interesting - order in the anyOf makes a difference. | ||
150 | # Number w/ multipleOf must be listed last, otherwise errors occur. | ||
151 | col_properties = { | ||
152 | 'anyOf': [ | ||
153 | { | ||
154 | 'type': 'null' | ||
155 | }, | ||
156 | { | ||
157 | 'type': 'number', | ||
158 | 'multipleOf': 1e-15 | ||
159 | }, | ||
160 | { | ||
161 | 'type': 'string' | ||
162 | } | ||
163 | ] | ||
164 | } | ||
165 | column_gs_type = 'numberType' | ||
166 | # Catch-all to deal with other types and set to string | ||
167 | # column_effective_value_type: formulaValue, errorValue, or other | ||
144 | else: | 168 | else: |
145 | # Interesting - order in the anyOf makes a difference. | 169 | col_properties = {'type': ['null', 'string']} |
146 | # Number w/ multipleOf must be listed last, otherwise errors occur. | 170 | column_gs_type = 'unsupportedValue' |
147 | col_properties = { | 171 | LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( |
148 | 'anyOf': [ | 172 | sheet_title, column_name, column_letter, column_effective_value_type, col_val)) |
149 | { | 173 | LOGGER.info('Converting to string.') |
150 | 'type': 'string' | 174 | else: # skipped |
151 | }, | 175 | column_is_skipped = True |
152 | { | 176 | skipped = skipped + 1 |
153 | 'type': 'null' | 177 | column_index_str = str(column_index).zfill(2) |
154 | }, | 178 | column_name = '__sdc_skip_col_{}'.format(column_index_str) |
155 | { | ||
156 | 'type': 'number', | ||
157 | 'multipleOf': 1e-15 | ||
158 | } | ||
159 | ] | ||
160 | } | ||
161 | column_gs_type = 'numberType' | ||
162 | # Catch-all to deal with other types and set to string | ||
163 | # column_effective_value_type: formulaValue, errorValue, or other | ||
164 | else: | ||
165 | col_properties = {'type': ['null', 'string']} | 179 | col_properties = {'type': ['null', 'string']} |
166 | column_gs_type = 'unsupportedValue' | 180 | column_gs_type = 'stringValue' |
167 | LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( | 181 | LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format( |
168 | sheet_title, column_name, column_letter, column_effective_value_type, col_val)) | 182 | sheet_title, column_name, column_letter)) |
169 | LOGGER.info('Converting to string.') | 183 | LOGGER.info(' This column will be skipped during data loading.') |
170 | else: # skipped | ||
171 | column_is_skipped = True | ||
172 | skipped = skipped + 1 | ||
173 | column_index_str = str(column_index).zfill(2) | ||
174 | column_name = '__sdc_skip_col_{}'.format(column_index_str) | ||
175 | col_properties = {'type': ['null', 'string']} | ||
176 | column_gs_type = 'stringValue' | ||
177 | LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format( | ||
178 | sheet_title, column_name, column_letter)) | ||
179 | LOGGER.info(' This column will be skipped during data loading.') | ||
180 | |||
181 | if skipped >= 2: | ||
182 | # skipped = 2 consecutive skipped headers | ||
183 | # Remove prior_header column_name | ||
184 | sheet_json_schema['properties'].pop(prior_header, None) | ||
185 | LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format( | ||
186 | sheet_title, column_name, column_letter)) | ||
187 | break | ||
188 | |||
189 | else: | ||
190 | column = {} | ||
191 | column = { | ||
192 | 'columnIndex': column_index, | ||
193 | 'columnLetter': column_letter, | ||
194 | 'columnName': column_name, | ||
195 | 'columnType': column_gs_type, | ||
196 | 'columnSkipped': column_is_skipped | ||
197 | } | ||
198 | columns.append(column) | ||
199 | 184 | ||
200 | sheet_json_schema['properties'][column_name] = col_properties | 185 | if skipped >= 2: |
186 | # skipped = 2 consecutive skipped headers | ||
187 | # Remove prior_header column_name | ||
188 | sheet_json_schema['properties'].pop(prior_header, None) | ||
189 | LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format( | ||
190 | sheet_title, column_name, column_letter)) | ||
191 | break | ||
201 | 192 | ||
202 | prior_header = column_name | 193 | else: |
203 | i = i + 1 | 194 | column = {} |
195 | column = { | ||
196 | 'columnIndex': column_index, | ||
197 | 'columnLetter': column_letter, | ||
198 | 'columnName': column_name, | ||
199 | 'columnType': column_gs_type, | ||
200 | 'columnSkipped': column_is_skipped | ||
201 | } | ||
202 | columns.append(column) | ||
204 | 203 | ||
205 | return sheet_json_schema, columns | 204 | sheet_json_schema['properties'][column_name] = col_properties |
205 | |||
206 | prior_header = column_name | ||
207 | i = i + 1 | ||
208 | |||
209 | return sheet_json_schema, columns | ||
206 | 210 | ||
207 | 211 | ||
208 | # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query | 212 | # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query |
@@ -276,17 +280,18 @@ def get_schemas(client, spreadsheet_id): | |||
276 | for sheet in sheets: | 280 | for sheet in sheets: |
277 | # GET sheet_json_schema for each worksheet (from function above) | 281 | # GET sheet_json_schema for each worksheet (from function above) |
278 | sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) | 282 | sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) |
279 | # LOGGER.info('columns = {}'.format(columns)) | 283 | |
280 | 284 | # SKIP empty sheets (where sheet_json_schema and columns are None) | |
281 | sheet_title = sheet.get('properties', {}).get('title') | 285 | if sheet_json_schema and columns: |
282 | schemas[sheet_title] = sheet_json_schema | 286 | sheet_title = sheet.get('properties', {}).get('title') |
283 | sheet_mdata = metadata.new() | 287 | schemas[sheet_title] = sheet_json_schema |
284 | sheet_mdata = metadata.get_standard_metadata( | 288 | sheet_mdata = metadata.new() |
285 | schema=sheet_json_schema, | 289 | sheet_mdata = metadata.get_standard_metadata( |
286 | key_properties=['__sdc_row'], | 290 | schema=sheet_json_schema, |
287 | valid_replication_keys=None, | 291 | key_properties=['__sdc_row'], |
288 | replication_method='FULL_TABLE' | 292 | valid_replication_keys=None, |
289 | ) | 293 | replication_method='FULL_TABLE' |
290 | field_metadata[sheet_title] = sheet_mdata | 294 | ) |
295 | field_metadata[sheet_title] = sheet_mdata | ||
291 | 296 | ||
292 | return schemas, field_metadata | 297 | return schemas, field_metadata |