diff options
author | cosimon <cosimon@users.noreply.github.com> | 2020-04-17 09:09:41 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-04-17 09:09:41 -0400 |
commit | f6d7c509f00feb63810df7a9216fe1769195197a (patch) | |
tree | 0727c5ccc646e49177db4dec43b58df4c02aa2c2 /tap_google_sheets/schema.py | |
parent | f012a137927591d5d9068d5d499d424ea4112ea9 (diff) | |
download | tap-google-sheets-f6d7c509f00feb63810df7a9216fe1769195197a.tar.gz tap-google-sheets-f6d7c509f00feb63810df7a9216fe1769195197a.tar.zst tap-google-sheets-f6d7c509f00feb63810df7a9216fe1769195197a.zip |
Skip malformed sheets (#13)
* Skip malformed sheets
* Move the try except block up a level for readability sake
* Remove debug code
Diffstat (limited to 'tap_google_sheets/schema.py')
-rw-r--r-- | tap_google_sheets/schema.py | 339 |
1 files changed, 171 insertions, 168 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index c229d72..3e63e75 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py | |||
@@ -29,184 +29,183 @@ def get_sheet_schema_columns(sheet): | |||
29 | # Empty sheet, SKIP | 29 | # Empty sheet, SKIP |
30 | LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title)) | 30 | LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title)) |
31 | return None, None | 31 | return None, None |
32 | else: | 32 | # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse |
33 | # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse | 33 | headers = row_data[0].get('values', []) |
34 | headers = row_data[0].get('values', []) | 34 | first_values = row_data[1].get('values', []) |
35 | first_values = row_data[1].get('values', []) | 35 | # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) |
36 | # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) | 36 | |
37 | 37 | sheet_json_schema = { | |
38 | sheet_json_schema = { | 38 | 'type': 'object', |
39 | 'type': 'object', | 39 | 'additionalProperties': False, |
40 | 'additionalProperties': False, | 40 | 'properties': { |
41 | 'properties': { | 41 | '__sdc_spreadsheet_id': { |
42 | '__sdc_spreadsheet_id': { | 42 | 'type': ['null', 'string'] |
43 | 'type': ['null', 'string'] | 43 | }, |
44 | }, | 44 | '__sdc_sheet_id': { |
45 | '__sdc_sheet_id': { | 45 | 'type': ['null', 'integer'] |
46 | 'type': ['null', 'integer'] | 46 | }, |
47 | }, | 47 | '__sdc_row': { |
48 | '__sdc_row': { | 48 | 'type': ['null', 'integer'] |
49 | 'type': ['null', 'integer'] | ||
50 | } | ||
51 | } | 49 | } |
52 | } | 50 | } |
51 | } | ||
52 | |||
53 | header_list = [] # used for checking uniqueness | ||
54 | columns = [] | ||
55 | prior_header = None | ||
56 | i = 0 | ||
57 | skipped = 0 | ||
58 | # Read column headers until end or 2 consecutive skipped headers | ||
59 | for header in headers: | ||
60 | # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True))) | ||
61 | column_index = i + 1 | ||
62 | column_letter = colnum_string(column_index) | ||
63 | header_value = header.get('formattedValue') | ||
64 | if header_value: # NOT skipped | ||
65 | column_is_skipped = False | ||
66 | skipped = 0 | ||
67 | column_name = '{}'.format(header_value) | ||
68 | if column_name in header_list: | ||
69 | raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format( | ||
70 | sheet_title, column_name, column_letter)) | ||
71 | header_list.append(column_name) | ||
53 | 72 | ||
54 | header_list = [] # used for checking uniqueness | 73 | first_value = None |
55 | columns = [] | 74 | try: |
56 | prior_header = None | 75 | first_value = first_values[i] |
57 | i = 0 | 76 | except IndexError as err: |
58 | skipped = 0 | 77 | raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format( |
59 | # Read column headers until end or 2 consecutive skipped headers | 78 | sheet_title, column_name, column_letter, err)) |
60 | for header in headers: | 79 | |
61 | # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True))) | 80 | column_effective_value = first_value.get('effectiveValue', {}) |
62 | column_index = i + 1 | 81 | |
63 | column_letter = colnum_string(column_index) | 82 | col_val = None |
64 | header_value = header.get('formattedValue') | 83 | if column_effective_value == {}: |
65 | if header_value: # NOT skipped | 84 | column_effective_value_type = 'stringValue' |
66 | column_is_skipped = False | 85 | LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format( |
67 | skipped = 0 | 86 | sheet_title, column_name, column_letter)) |
68 | column_name = '{}'.format(header_value) | 87 | LOGGER.info(' Setting column datatype to STRING') |
69 | if column_name in header_list: | 88 | else: |
70 | raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format( | 89 | for key, val in column_effective_value.items(): |
71 | sheet_title, column_name, column_letter)) | 90 | if key in ('numberValue', 'stringValue', 'boolValue'): |
72 | header_list.append(column_name) | 91 | column_effective_value_type = key |
73 | 92 | col_val = str(val) | |
74 | first_value = None | 93 | elif key in ('errorType', 'formulaType'): |
75 | try: | 94 | col_val = str(val) |
76 | first_value = first_values[i] | 95 | raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( |
77 | except IndexError as err: | 96 | sheet_title, column_name, column_letter, key, col_val)) |
78 | raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format( | 97 | |
79 | sheet_title, column_name, column_letter, err)) | 98 | column_number_format = first_values[i].get('effectiveFormat', {}).get( |
80 | 99 | 'numberFormat', {}) | |
81 | column_effective_value = first_value.get('effectiveValue', {}) | 100 | column_number_format_type = column_number_format.get('type') |
82 | 101 | ||
83 | col_val = None | 102 | # Determine datatype for sheet_json_schema |
84 | if column_effective_value == {}: | 103 | # |
85 | column_effective_value_type = 'stringValue' | 104 | # column_effective_value_type = numberValue, stringValue, boolValue; |
86 | LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format( | 105 | # INVALID: errorType, formulaType |
106 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue | ||
107 | # | ||
108 | # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, | ||
109 | # TIME, DATE_TIME, SCIENTIFIC | ||
110 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType | ||
111 | # | ||
112 | column_format = None # Default | ||
113 | if column_effective_value == {}: | ||
114 | col_properties = {'type': ['null', 'string']} | ||
115 | column_gs_type = 'stringValue' | ||
116 | LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format( | ||
87 | sheet_title, column_name, column_letter)) | 117 | sheet_title, column_name, column_letter)) |
88 | LOGGER.info(' Setting column datatype to STRING') | 118 | LOGGER.info(' Setting column datatype to STRING') |
89 | else: | 119 | elif column_effective_value_type == 'stringValue': |
90 | for key, val in column_effective_value.items(): | 120 | col_properties = {'type': ['null', 'string']} |
91 | if key in ('numberValue', 'stringValue', 'boolValue'): | 121 | column_gs_type = 'stringValue' |
92 | column_effective_value_type = key | 122 | elif column_effective_value_type == 'boolValue': |
93 | col_val = str(val) | 123 | col_properties = {'type': ['null', 'boolean', 'string']} |
94 | elif key in ('errorType', 'formulaType'): | 124 | column_gs_type = 'boolValue' |
95 | col_val = str(val) | 125 | elif column_effective_value_type == 'numberValue': |
96 | raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( | 126 | if column_number_format_type == 'DATE_TIME': |
97 | sheet_title, column_name, column_letter, key, col_val)) | 127 | col_properties = { |
98 | 128 | 'type': ['null', 'string'], | |
99 | column_number_format = first_values[i].get('effectiveFormat', {}).get( | 129 | 'format': 'date-time' |
100 | 'numberFormat', {}) | 130 | } |
101 | column_number_format_type = column_number_format.get('type') | 131 | column_gs_type = 'numberType.DATE_TIME' |
102 | 132 | elif column_number_format_type == 'DATE': | |
103 | # Determine datatype for sheet_json_schema | 133 | col_properties = { |
104 | # | 134 | 'type': ['null', 'string'], |
105 | # column_effective_value_type = numberValue, stringValue, boolValue; | 135 | 'format': 'date' |
106 | # INVALID: errorType, formulaType | 136 | } |
107 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue | 137 | column_gs_type = 'numberType.DATE' |
108 | # | 138 | elif column_number_format_type == 'TIME': |
109 | # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, | 139 | col_properties = { |
110 | # TIME, DATE_TIME, SCIENTIFIC | 140 | 'type': ['null', 'string'], |
111 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType | 141 | 'format': 'time' |
112 | # | 142 | } |
113 | column_format = None # Default | 143 | column_gs_type = 'numberType.TIME' |
114 | if column_effective_value == {}: | 144 | elif column_number_format_type == 'TEXT': |
115 | col_properties = {'type': ['null', 'string']} | ||
116 | column_gs_type = 'stringValue' | ||
117 | LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format( | ||
118 | sheet_title, column_name, column_letter)) | ||
119 | LOGGER.info(' Setting column datatype to STRING') | ||
120 | elif column_effective_value_type == 'stringValue': | ||
121 | col_properties = {'type': ['null', 'string']} | 145 | col_properties = {'type': ['null', 'string']} |
122 | column_gs_type = 'stringValue' | 146 | column_gs_type = 'stringValue' |
123 | elif column_effective_value_type == 'boolValue': | ||
124 | col_properties = {'type': ['null', 'boolean', 'string']} | ||
125 | column_gs_type = 'boolValue' | ||
126 | elif column_effective_value_type == 'numberValue': | ||
127 | if column_number_format_type == 'DATE_TIME': | ||
128 | col_properties = { | ||
129 | 'type': ['null', 'string'], | ||
130 | 'format': 'date-time' | ||
131 | } | ||
132 | column_gs_type = 'numberType.DATE_TIME' | ||
133 | elif column_number_format_type == 'DATE': | ||
134 | col_properties = { | ||
135 | 'type': ['null', 'string'], | ||
136 | 'format': 'date' | ||
137 | } | ||
138 | column_gs_type = 'numberType.DATE' | ||
139 | elif column_number_format_type == 'TIME': | ||
140 | col_properties = { | ||
141 | 'type': ['null', 'string'], | ||
142 | 'format': 'time' | ||
143 | } | ||
144 | column_gs_type = 'numberType.TIME' | ||
145 | elif column_number_format_type == 'TEXT': | ||
146 | col_properties = {'type': ['null', 'string']} | ||
147 | column_gs_type = 'stringValue' | ||
148 | else: | ||
149 | # Interesting - order in the anyOf makes a difference. | ||
150 | # Number w/ multipleOf must be listed last, otherwise errors occur. | ||
151 | col_properties = { | ||
152 | 'anyOf': [ | ||
153 | { | ||
154 | 'type': 'null' | ||
155 | }, | ||
156 | { | ||
157 | 'type': 'number', | ||
158 | 'multipleOf': 1e-15 | ||
159 | }, | ||
160 | { | ||
161 | 'type': 'string' | ||
162 | } | ||
163 | ] | ||
164 | } | ||
165 | column_gs_type = 'numberType' | ||
166 | # Catch-all to deal with other types and set to string | ||
167 | # column_effective_value_type: formulaValue, errorValue, or other | ||
168 | else: | 147 | else: |
169 | col_properties = {'type': ['null', 'string']} | 148 | # Interesting - order in the anyOf makes a difference. |
170 | column_gs_type = 'unsupportedValue' | 149 | # Number w/ multipleOf must be listed last, otherwise errors occur. |
171 | LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( | 150 | col_properties = { |
172 | sheet_title, column_name, column_letter, column_effective_value_type, col_val)) | 151 | 'anyOf': [ |
173 | LOGGER.info('Converting to string.') | 152 | { |
174 | else: # skipped | 153 | 'type': 'null' |
175 | column_is_skipped = True | 154 | }, |
176 | skipped = skipped + 1 | 155 | { |
177 | column_index_str = str(column_index).zfill(2) | 156 | 'type': 'number', |
178 | column_name = '__sdc_skip_col_{}'.format(column_index_str) | 157 | 'multipleOf': 1e-15 |
179 | col_properties = {'type': ['null', 'string']} | 158 | }, |
180 | column_gs_type = 'stringValue' | 159 | { |
181 | LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format( | 160 | 'type': 'string' |
182 | sheet_title, column_name, column_letter)) | 161 | } |
183 | LOGGER.info(' This column will be skipped during data loading.') | 162 | ] |
184 | 163 | } | |
185 | if skipped >= 2: | 164 | column_gs_type = 'numberType' |
186 | # skipped = 2 consecutive skipped headers | 165 | # Catch-all to deal with other types and set to string |
187 | # Remove prior_header column_name | 166 | # column_effective_value_type: formulaValue, errorValue, or other |
188 | sheet_json_schema['properties'].pop(prior_header, None) | ||
189 | LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format( | ||
190 | sheet_title, column_name, column_letter)) | ||
191 | break | ||
192 | |||
193 | else: | 167 | else: |
194 | column = {} | 168 | col_properties = {'type': ['null', 'string']} |
195 | column = { | 169 | column_gs_type = 'unsupportedValue' |
196 | 'columnIndex': column_index, | 170 | LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( |
197 | 'columnLetter': column_letter, | 171 | sheet_title, column_name, column_letter, column_effective_value_type, col_val)) |
198 | 'columnName': column_name, | 172 | LOGGER.info('Converting to string.') |
199 | 'columnType': column_gs_type, | 173 | else: # skipped |
200 | 'columnSkipped': column_is_skipped | 174 | column_is_skipped = True |
201 | } | 175 | skipped = skipped + 1 |
202 | columns.append(column) | 176 | column_index_str = str(column_index).zfill(2) |
177 | column_name = '__sdc_skip_col_{}'.format(column_index_str) | ||
178 | col_properties = {'type': ['null', 'string']} | ||
179 | column_gs_type = 'stringValue' | ||
180 | LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format( | ||
181 | sheet_title, column_name, column_letter)) | ||
182 | LOGGER.info(' This column will be skipped during data loading.') | ||
183 | |||
184 | if skipped >= 2: | ||
185 | # skipped = 2 consecutive skipped headers | ||
186 | # Remove prior_header column_name | ||
187 | sheet_json_schema['properties'].pop(prior_header, None) | ||
188 | LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format( | ||
189 | sheet_title, column_name, column_letter)) | ||
190 | break | ||
191 | |||
192 | else: | ||
193 | column = {} | ||
194 | column = { | ||
195 | 'columnIndex': column_index, | ||
196 | 'columnLetter': column_letter, | ||
197 | 'columnName': column_name, | ||
198 | 'columnType': column_gs_type, | ||
199 | 'columnSkipped': column_is_skipped | ||
200 | } | ||
201 | columns.append(column) | ||
203 | 202 | ||
204 | sheet_json_schema['properties'][column_name] = col_properties | 203 | sheet_json_schema['properties'][column_name] = col_properties |
205 | 204 | ||
206 | prior_header = column_name | 205 | prior_header = column_name |
207 | i = i + 1 | 206 | i = i + 1 |
208 | 207 | ||
209 | return sheet_json_schema, columns | 208 | return sheet_json_schema, columns |
210 | 209 | ||
211 | 210 | ||
212 | # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query | 211 | # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query |
@@ -232,7 +231,11 @@ def get_sheet_metadata(sheet, spreadsheet_id, client): | |||
232 | sheet_metadata = sheet_md_results.get('sheets')[0] | 231 | sheet_metadata = sheet_md_results.get('sheets')[0] |
233 | 232 | ||
234 | # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results) | 233 | # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results) |
235 | sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata) | 234 | try: |
235 | sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata) | ||
236 | except: | ||
237 | LOGGER.info('SKIPPING Malformed sheet: {}'.format(sheet_title)) | ||
238 | sheet_json_schema, columns = None, None | ||
236 | 239 | ||
237 | return sheet_json_schema, columns | 240 | return sheet_json_schema, columns |
238 | 241 | ||