diff options
Diffstat (limited to 'tap_google_sheets/schema.py')
-rw-r--r-- | tap_google_sheets/schema.py | 365 |
1 files changed, 185 insertions, 180 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index e319c03..c229d72 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py | |||
@@ -25,184 +25,188 @@ def get_sheet_schema_columns(sheet): | |||
25 | sheet_json_schema = OrderedDict() | 25 | sheet_json_schema = OrderedDict() |
26 | data = next(iter(sheet.get('data', [])), {}) | 26 | data = next(iter(sheet.get('data', [])), {}) |
27 | row_data = data.get('rowData', []) | 27 | row_data = data.get('rowData', []) |
28 | # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse | 28 | if row_data == []: |
29 | 29 | # Empty sheet, SKIP | |
30 | headers = row_data[0].get('values', []) | 30 | LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title)) |
31 | first_values = row_data[1].get('values', []) | 31 | return None, None |
32 | # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) | 32 | else: |
33 | 33 | # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse | |
34 | sheet_json_schema = { | 34 | headers = row_data[0].get('values', []) |
35 | 'type': 'object', | 35 | first_values = row_data[1].get('values', []) |
36 | 'additionalProperties': False, | 36 | # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) |
37 | 'properties': { | 37 | |
38 | '__sdc_spreadsheet_id': { | 38 | sheet_json_schema = { |
39 | 'type': ['null', 'string'] | 39 | 'type': 'object', |
40 | }, | 40 | 'additionalProperties': False, |
41 | '__sdc_sheet_id': { | 41 | 'properties': { |
42 | 'type': ['null', 'integer'] | 42 | '__sdc_spreadsheet_id': { |
43 | }, | 43 | 'type': ['null', 'string'] |
44 | '__sdc_row': { | 44 | }, |
45 | 'type': ['null', 'integer'] | 45 | '__sdc_sheet_id': { |
46 | 'type': ['null', 'integer'] | ||
47 | }, | ||
48 | '__sdc_row': { | ||
49 | 'type': ['null', 'integer'] | ||
50 | } | ||
46 | } | 51 | } |
47 | } | 52 | } |
48 | } | 53 | |
49 | 54 | header_list = [] # used for checking uniqueness | |
50 | header_list = [] # used for checking uniqueness | 55 | columns = [] |
51 | columns = [] | 56 | prior_header = None |
52 | prior_header = None | 57 | i = 0 |
53 | i = 0 | 58 | skipped = 0 |
54 | skipped = 0 | 59 | # Read column headers until end or 2 consecutive skipped headers |
55 | # Read column headers until end or 2 consecutive skipped headers | 60 | for header in headers: |
56 | for header in headers: | 61 | # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True))) |
57 | # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True))) | 62 | column_index = i + 1 |
58 | column_index = i + 1 | 63 | column_letter = colnum_string(column_index) |
59 | column_letter = colnum_string(column_index) | 64 | header_value = header.get('formattedValue') |
60 | header_value = header.get('formattedValue') | 65 | if header_value: # NOT skipped |
61 | if header_value: # NOT skipped | 66 | column_is_skipped = False |
62 | column_is_skipped = False | 67 | skipped = 0 |
63 | skipped = 0 | 68 | column_name = '{}'.format(header_value) |
64 | column_name = '{}'.format(header_value) | 69 | if column_name in header_list: |
65 | if column_name in header_list: | 70 | raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format( |
66 | raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format( | ||
67 | sheet_title, column_name, column_letter)) | ||
68 | header_list.append(column_name) | ||
69 | |||
70 | first_value = None | ||
71 | try: | ||
72 | first_value = first_values[i] | ||
73 | except IndexError as err: | ||
74 | raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format( | ||
75 | sheet_title, column_name, column_letter, err)) | ||
76 | |||
77 | column_effective_value = first_value.get('effectiveValue', {}) | ||
78 | |||
79 | col_val = None | ||
80 | if column_effective_value == {}: | ||
81 | column_effective_value_type = 'stringValue' | ||
82 | LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format( | ||
83 | sheet_title, column_name, column_letter)) | ||
84 | LOGGER.info(' Setting column datatype to STRING') | ||
85 | else: | ||
86 | for key, val in column_effective_value.items(): | ||
87 | if key in ('numberValue', 'stringValue', 'boolValue'): | ||
88 | column_effective_value_type = key | ||
89 | col_val = str(val) | ||
90 | elif key in ('errorType', 'formulaType'): | ||
91 | col_val = str(val) | ||
92 | raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( | ||
93 | sheet_title, column_name, column_letter, key, col_val)) | ||
94 | |||
95 | column_number_format = first_values[i].get('effectiveFormat', {}).get( | ||
96 | 'numberFormat', {}) | ||
97 | column_number_format_type = column_number_format.get('type') | ||
98 | |||
99 | # Determine datatype for sheet_json_schema | ||
100 | # | ||
101 | # column_effective_value_type = numberValue, stringValue, boolValue; | ||
102 | # INVALID: errorType, formulaType | ||
103 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue | ||
104 | # | ||
105 | # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, | ||
106 | # TIME, DATE_TIME, SCIENTIFIC | ||
107 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType | ||
108 | # | ||
109 | column_format = None # Default | ||
110 | if column_effective_value == {}: | ||
111 | col_properties = {'type': ['null', 'string']} | ||
112 | column_gs_type = 'stringValue' | ||
113 | LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format( | ||
114 | sheet_title, column_name, column_letter)) | 71 | sheet_title, column_name, column_letter)) |
115 | LOGGER.info(' Setting column datatype to STRING') | 72 | header_list.append(column_name) |
116 | elif column_effective_value_type == 'stringValue': | 73 | |
117 | col_properties = {'type': ['null', 'string']} | 74 | first_value = None |
118 | column_gs_type = 'stringValue' | 75 | try: |
119 | elif column_effective_value_type == 'boolValue': | 76 | first_value = first_values[i] |
120 | col_properties = {'type': ['null', 'boolean', 'string']} | 77 | except IndexError as err: |
121 | column_gs_type = 'boolValue' | 78 | raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format( |
122 | elif column_effective_value_type == 'numberValue': | 79 | sheet_title, column_name, column_letter, err)) |
123 | if column_number_format_type == 'DATE_TIME': | 80 | |
124 | col_properties = { | 81 | column_effective_value = first_value.get('effectiveValue', {}) |
125 | 'type': ['null', 'string'], | 82 | |
126 | 'format': 'date-time' | 83 | col_val = None |
127 | } | 84 | if column_effective_value == {}: |
128 | column_gs_type = 'numberType.DATE_TIME' | 85 | column_effective_value_type = 'stringValue' |
129 | elif column_number_format_type == 'DATE': | 86 | LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format( |
130 | col_properties = { | 87 | sheet_title, column_name, column_letter)) |
131 | 'type': ['null', 'string'], | 88 | LOGGER.info(' Setting column datatype to STRING') |
132 | 'format': 'date' | 89 | else: |
133 | } | 90 | for key, val in column_effective_value.items(): |
134 | column_gs_type = 'numberType.DATE' | 91 | if key in ('numberValue', 'stringValue', 'boolValue'): |
135 | elif column_number_format_type == 'TIME': | 92 | column_effective_value_type = key |
136 | col_properties = { | 93 | col_val = str(val) |
137 | 'type': ['null', 'string'], | 94 | elif key in ('errorType', 'formulaType'): |
138 | 'format': 'time' | 95 | col_val = str(val) |
139 | } | 96 | raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( |
140 | column_gs_type = 'numberType.TIME' | 97 | sheet_title, column_name, column_letter, key, col_val)) |
141 | elif column_number_format_type == 'TEXT': | 98 | |
99 | column_number_format = first_values[i].get('effectiveFormat', {}).get( | ||
100 | 'numberFormat', {}) | ||
101 | column_number_format_type = column_number_format.get('type') | ||
102 | |||
103 | # Determine datatype for sheet_json_schema | ||
104 | # | ||
105 | # column_effective_value_type = numberValue, stringValue, boolValue; | ||
106 | # INVALID: errorType, formulaType | ||
107 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue | ||
108 | # | ||
109 | # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, | ||
110 | # TIME, DATE_TIME, SCIENTIFIC | ||
111 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType | ||
112 | # | ||
113 | column_format = None # Default | ||
114 | if column_effective_value == {}: | ||
115 | col_properties = {'type': ['null', 'string']} | ||
116 | column_gs_type = 'stringValue' | ||
117 | LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format( | ||
118 | sheet_title, column_name, column_letter)) | ||
119 | LOGGER.info(' Setting column datatype to STRING') | ||
120 | elif column_effective_value_type == 'stringValue': | ||
142 | col_properties = {'type': ['null', 'string']} | 121 | col_properties = {'type': ['null', 'string']} |
143 | column_gs_type = 'stringValue' | 122 | column_gs_type = 'stringValue' |
123 | elif column_effective_value_type == 'boolValue': | ||
124 | col_properties = {'type': ['null', 'boolean', 'string']} | ||
125 | column_gs_type = 'boolValue' | ||
126 | elif column_effective_value_type == 'numberValue': | ||
127 | if column_number_format_type == 'DATE_TIME': | ||
128 | col_properties = { | ||
129 | 'type': ['null', 'string'], | ||
130 | 'format': 'date-time' | ||
131 | } | ||
132 | column_gs_type = 'numberType.DATE_TIME' | ||
133 | elif column_number_format_type == 'DATE': | ||
134 | col_properties = { | ||
135 | 'type': ['null', 'string'], | ||
136 | 'format': 'date' | ||
137 | } | ||
138 | column_gs_type = 'numberType.DATE' | ||
139 | elif column_number_format_type == 'TIME': | ||
140 | col_properties = { | ||
141 | 'type': ['null', 'string'], | ||
142 | 'format': 'time' | ||
143 | } | ||
144 | column_gs_type = 'numberType.TIME' | ||
145 | elif column_number_format_type == 'TEXT': | ||
146 | col_properties = {'type': ['null', 'string']} | ||
147 | column_gs_type = 'stringValue' | ||
148 | else: | ||
149 | # Interesting - order in the anyOf makes a difference. | ||
150 | # Number w/ multipleOf must be listed last, otherwise errors occur. | ||
151 | col_properties = { | ||
152 | 'anyOf': [ | ||
153 | { | ||
154 | 'type': 'null' | ||
155 | }, | ||
156 | { | ||
157 | 'type': 'number', | ||
158 | 'multipleOf': 1e-15 | ||
159 | }, | ||
160 | { | ||
161 | 'type': 'string' | ||
162 | } | ||
163 | ] | ||
164 | } | ||
165 | column_gs_type = 'numberType' | ||
166 | # Catch-all to deal with other types and set to string | ||
167 | # column_effective_value_type: formulaValue, errorValue, or other | ||
144 | else: | 168 | else: |
145 | # Interesting - order in the anyOf makes a difference. | 169 | col_properties = {'type': ['null', 'string']} |
146 | # Number w/ multipleOf must be listed last, otherwise errors occur. | 170 | column_gs_type = 'unsupportedValue' |
147 | col_properties = { | 171 | LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( |
148 | 'anyOf': [ | 172 | sheet_title, column_name, column_letter, column_effective_value_type, col_val)) |
149 | { | 173 | LOGGER.info('Converting to string.') |
150 | 'type': 'string' | 174 | else: # skipped |
151 | }, | 175 | column_is_skipped = True |
152 | { | 176 | skipped = skipped + 1 |
153 | 'type': 'null' | 177 | column_index_str = str(column_index).zfill(2) |
154 | }, | 178 | column_name = '__sdc_skip_col_{}'.format(column_index_str) |
155 | { | ||
156 | 'type': 'number', | ||
157 | 'multipleOf': 1e-15 | ||
158 | } | ||
159 | ] | ||
160 | } | ||
161 | column_gs_type = 'numberType' | ||
162 | # Catch-all to deal with other types and set to string | ||
163 | # column_effective_value_type: formulaValue, errorValue, or other | ||
164 | else: | ||
165 | col_properties = {'type': ['null', 'string']} | 179 | col_properties = {'type': ['null', 'string']} |
166 | column_gs_type = 'unsupportedValue' | 180 | column_gs_type = 'stringValue' |
167 | LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( | 181 | LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format( |
168 | sheet_title, column_name, column_letter, column_effective_value_type, col_val)) | 182 | sheet_title, column_name, column_letter)) |
169 | LOGGER.info('Converting to string.') | 183 | LOGGER.info(' This column will be skipped during data loading.') |
170 | else: # skipped | ||
171 | column_is_skipped = True | ||
172 | skipped = skipped + 1 | ||
173 | column_index_str = str(column_index).zfill(2) | ||
174 | column_name = '__sdc_skip_col_{}'.format(column_index_str) | ||
175 | col_properties = {'type': ['null', 'string']} | ||
176 | column_gs_type = 'stringValue' | ||
177 | LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format( | ||
178 | sheet_title, column_name, column_letter)) | ||
179 | LOGGER.info(' This column will be skipped during data loading.') | ||
180 | |||
181 | if skipped >= 2: | ||
182 | # skipped = 2 consecutive skipped headers | ||
183 | # Remove prior_header column_name | ||
184 | sheet_json_schema['properties'].pop(prior_header, None) | ||
185 | LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format( | ||
186 | sheet_title, column_name, column_letter)) | ||
187 | break | ||
188 | |||
189 | else: | ||
190 | column = {} | ||
191 | column = { | ||
192 | 'columnIndex': column_index, | ||
193 | 'columnLetter': column_letter, | ||
194 | 'columnName': column_name, | ||
195 | 'columnType': column_gs_type, | ||
196 | 'columnSkipped': column_is_skipped | ||
197 | } | ||
198 | columns.append(column) | ||
199 | 184 | ||
200 | sheet_json_schema['properties'][column_name] = col_properties | 185 | if skipped >= 2: |
186 | # skipped = 2 consecutive skipped headers | ||
187 | # Remove prior_header column_name | ||
188 | sheet_json_schema['properties'].pop(prior_header, None) | ||
189 | LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format( | ||
190 | sheet_title, column_name, column_letter)) | ||
191 | break | ||
201 | 192 | ||
202 | prior_header = column_name | 193 | else: |
203 | i = i + 1 | 194 | column = {} |
195 | column = { | ||
196 | 'columnIndex': column_index, | ||
197 | 'columnLetter': column_letter, | ||
198 | 'columnName': column_name, | ||
199 | 'columnType': column_gs_type, | ||
200 | 'columnSkipped': column_is_skipped | ||
201 | } | ||
202 | columns.append(column) | ||
204 | 203 | ||
205 | return sheet_json_schema, columns | 204 | sheet_json_schema['properties'][column_name] = col_properties |
205 | |||
206 | prior_header = column_name | ||
207 | i = i + 1 | ||
208 | |||
209 | return sheet_json_schema, columns | ||
206 | 210 | ||
207 | 211 | ||
208 | # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query | 212 | # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query |
@@ -276,17 +280,18 @@ def get_schemas(client, spreadsheet_id): | |||
276 | for sheet in sheets: | 280 | for sheet in sheets: |
277 | # GET sheet_json_schema for each worksheet (from function above) | 281 | # GET sheet_json_schema for each worksheet (from function above) |
278 | sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) | 282 | sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) |
279 | # LOGGER.info('columns = {}'.format(columns)) | 283 | |
280 | 284 | # SKIP empty sheets (where sheet_json_schema and columns are None) | |
281 | sheet_title = sheet.get('properties', {}).get('title') | 285 | if sheet_json_schema and columns: |
282 | schemas[sheet_title] = sheet_json_schema | 286 | sheet_title = sheet.get('properties', {}).get('title') |
283 | sheet_mdata = metadata.new() | 287 | schemas[sheet_title] = sheet_json_schema |
284 | sheet_mdata = metadata.get_standard_metadata( | 288 | sheet_mdata = metadata.new() |
285 | schema=sheet_json_schema, | 289 | sheet_mdata = metadata.get_standard_metadata( |
286 | key_properties=['__sdc_row'], | 290 | schema=sheet_json_schema, |
287 | valid_replication_keys=None, | 291 | key_properties=['__sdc_row'], |
288 | replication_method='FULL_TABLE' | 292 | valid_replication_keys=None, |
289 | ) | 293 | replication_method='FULL_TABLE' |
290 | field_metadata[sheet_title] = sheet_mdata | 294 | ) |
295 | field_metadata[sheet_title] = sheet_mdata | ||
291 | 296 | ||
292 | return schemas, field_metadata | 297 | return schemas, field_metadata |