aboutsummaryrefslogtreecommitdiffhomepage
path: root/tap_google_sheets/schema.py
diff options
context:
space:
mode:
authorcosimon <cosimon@users.noreply.github.com>2020-04-17 09:09:41 -0400
committerGitHub <noreply@github.com>2020-04-17 09:09:41 -0400
commitf6d7c509f00feb63810df7a9216fe1769195197a (patch)
tree0727c5ccc646e49177db4dec43b58df4c02aa2c2 /tap_google_sheets/schema.py
parentf012a137927591d5d9068d5d499d424ea4112ea9 (diff)
downloadtap-google-sheets-f6d7c509f00feb63810df7a9216fe1769195197a.tar.gz
tap-google-sheets-f6d7c509f00feb63810df7a9216fe1769195197a.tar.zst
tap-google-sheets-f6d7c509f00feb63810df7a9216fe1769195197a.zip
Skip malformed sheets (#13)
* Skip malformed sheets * Move the try except block up a level for readability sake * Remove debug code
Diffstat (limited to 'tap_google_sheets/schema.py')
-rw-r--r--tap_google_sheets/schema.py339
1 files changed, 171 insertions, 168 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py
index c229d72..3e63e75 100644
--- a/tap_google_sheets/schema.py
+++ b/tap_google_sheets/schema.py
@@ -29,184 +29,183 @@ def get_sheet_schema_columns(sheet):
29 # Empty sheet, SKIP 29 # Empty sheet, SKIP
30 LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title)) 30 LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
31 return None, None 31 return None, None
32 else: 32 # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
33 # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse 33 headers = row_data[0].get('values', [])
34 headers = row_data[0].get('values', []) 34 first_values = row_data[1].get('values', [])
35 first_values = row_data[1].get('values', []) 35 # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
36 # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) 36
37 37 sheet_json_schema = {
38 sheet_json_schema = { 38 'type': 'object',
39 'type': 'object', 39 'additionalProperties': False,
40 'additionalProperties': False, 40 'properties': {
41 'properties': { 41 '__sdc_spreadsheet_id': {
42 '__sdc_spreadsheet_id': { 42 'type': ['null', 'string']
43 'type': ['null', 'string'] 43 },
44 }, 44 '__sdc_sheet_id': {
45 '__sdc_sheet_id': { 45 'type': ['null', 'integer']
46 'type': ['null', 'integer'] 46 },
47 }, 47 '__sdc_row': {
48 '__sdc_row': { 48 'type': ['null', 'integer']
49 'type': ['null', 'integer']
50 }
51 } 49 }
52 } 50 }
51 }
52
53 header_list = [] # used for checking uniqueness
54 columns = []
55 prior_header = None
56 i = 0
57 skipped = 0
58 # Read column headers until end or 2 consecutive skipped headers
59 for header in headers:
60 # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
61 column_index = i + 1
62 column_letter = colnum_string(column_index)
63 header_value = header.get('formattedValue')
64 if header_value: # NOT skipped
65 column_is_skipped = False
66 skipped = 0
67 column_name = '{}'.format(header_value)
68 if column_name in header_list:
69 raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
70 sheet_title, column_name, column_letter))
71 header_list.append(column_name)
53 72
54 header_list = [] # used for checking uniqueness 73 first_value = None
55 columns = [] 74 try:
56 prior_header = None 75 first_value = first_values[i]
57 i = 0 76 except IndexError as err:
58 skipped = 0 77 raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
59 # Read column headers until end or 2 consecutive skipped headers 78 sheet_title, column_name, column_letter, err))
60 for header in headers: 79
61 # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True))) 80 column_effective_value = first_value.get('effectiveValue', {})
62 column_index = i + 1 81
63 column_letter = colnum_string(column_index) 82 col_val = None
64 header_value = header.get('formattedValue') 83 if column_effective_value == {}:
65 if header_value: # NOT skipped 84 column_effective_value_type = 'stringValue'
66 column_is_skipped = False 85 LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
67 skipped = 0 86 sheet_title, column_name, column_letter))
68 column_name = '{}'.format(header_value) 87 LOGGER.info(' Setting column datatype to STRING')
69 if column_name in header_list: 88 else:
70 raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format( 89 for key, val in column_effective_value.items():
71 sheet_title, column_name, column_letter)) 90 if key in ('numberValue', 'stringValue', 'boolValue'):
72 header_list.append(column_name) 91 column_effective_value_type = key
73 92 col_val = str(val)
74 first_value = None 93 elif key in ('errorType', 'formulaType'):
75 try: 94 col_val = str(val)
76 first_value = first_values[i] 95 raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
77 except IndexError as err: 96 sheet_title, column_name, column_letter, key, col_val))
78 raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format( 97
79 sheet_title, column_name, column_letter, err)) 98 column_number_format = first_values[i].get('effectiveFormat', {}).get(
80 99 'numberFormat', {})
81 column_effective_value = first_value.get('effectiveValue', {}) 100 column_number_format_type = column_number_format.get('type')
82 101
83 col_val = None 102 # Determine datatype for sheet_json_schema
84 if column_effective_value == {}: 103 #
85 column_effective_value_type = 'stringValue' 104 # column_effective_value_type = numberValue, stringValue, boolValue;
86 LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format( 105 # INVALID: errorType, formulaType
106 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
107 #
108 # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
109 # TIME, DATE_TIME, SCIENTIFIC
110 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
111 #
112 column_format = None # Default
113 if column_effective_value == {}:
114 col_properties = {'type': ['null', 'string']}
115 column_gs_type = 'stringValue'
116 LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
87 sheet_title, column_name, column_letter)) 117 sheet_title, column_name, column_letter))
88 LOGGER.info(' Setting column datatype to STRING') 118 LOGGER.info(' Setting column datatype to STRING')
89 else: 119 elif column_effective_value_type == 'stringValue':
90 for key, val in column_effective_value.items(): 120 col_properties = {'type': ['null', 'string']}
91 if key in ('numberValue', 'stringValue', 'boolValue'): 121 column_gs_type = 'stringValue'
92 column_effective_value_type = key 122 elif column_effective_value_type == 'boolValue':
93 col_val = str(val) 123 col_properties = {'type': ['null', 'boolean', 'string']}
94 elif key in ('errorType', 'formulaType'): 124 column_gs_type = 'boolValue'
95 col_val = str(val) 125 elif column_effective_value_type == 'numberValue':
96 raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( 126 if column_number_format_type == 'DATE_TIME':
97 sheet_title, column_name, column_letter, key, col_val)) 127 col_properties = {
98 128 'type': ['null', 'string'],
99 column_number_format = first_values[i].get('effectiveFormat', {}).get( 129 'format': 'date-time'
100 'numberFormat', {}) 130 }
101 column_number_format_type = column_number_format.get('type') 131 column_gs_type = 'numberType.DATE_TIME'
102 132 elif column_number_format_type == 'DATE':
103 # Determine datatype for sheet_json_schema 133 col_properties = {
104 # 134 'type': ['null', 'string'],
105 # column_effective_value_type = numberValue, stringValue, boolValue; 135 'format': 'date'
106 # INVALID: errorType, formulaType 136 }
107 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue 137 column_gs_type = 'numberType.DATE'
108 # 138 elif column_number_format_type == 'TIME':
109 # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, 139 col_properties = {
110 # TIME, DATE_TIME, SCIENTIFIC 140 'type': ['null', 'string'],
111 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType 141 'format': 'time'
112 # 142 }
113 column_format = None # Default 143 column_gs_type = 'numberType.TIME'
114 if column_effective_value == {}: 144 elif column_number_format_type == 'TEXT':
115 col_properties = {'type': ['null', 'string']}
116 column_gs_type = 'stringValue'
117 LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
118 sheet_title, column_name, column_letter))
119 LOGGER.info(' Setting column datatype to STRING')
120 elif column_effective_value_type == 'stringValue':
121 col_properties = {'type': ['null', 'string']} 145 col_properties = {'type': ['null', 'string']}
122 column_gs_type = 'stringValue' 146 column_gs_type = 'stringValue'
123 elif column_effective_value_type == 'boolValue':
124 col_properties = {'type': ['null', 'boolean', 'string']}
125 column_gs_type = 'boolValue'
126 elif column_effective_value_type == 'numberValue':
127 if column_number_format_type == 'DATE_TIME':
128 col_properties = {
129 'type': ['null', 'string'],
130 'format': 'date-time'
131 }
132 column_gs_type = 'numberType.DATE_TIME'
133 elif column_number_format_type == 'DATE':
134 col_properties = {
135 'type': ['null', 'string'],
136 'format': 'date'
137 }
138 column_gs_type = 'numberType.DATE'
139 elif column_number_format_type == 'TIME':
140 col_properties = {
141 'type': ['null', 'string'],
142 'format': 'time'
143 }
144 column_gs_type = 'numberType.TIME'
145 elif column_number_format_type == 'TEXT':
146 col_properties = {'type': ['null', 'string']}
147 column_gs_type = 'stringValue'
148 else:
149 # Interesting - order in the anyOf makes a difference.
150 # Number w/ multipleOf must be listed last, otherwise errors occur.
151 col_properties = {
152 'anyOf': [
153 {
154 'type': 'null'
155 },
156 {
157 'type': 'number',
158 'multipleOf': 1e-15
159 },
160 {
161 'type': 'string'
162 }
163 ]
164 }
165 column_gs_type = 'numberType'
166 # Catch-all to deal with other types and set to string
167 # column_effective_value_type: formulaValue, errorValue, or other
168 else: 147 else:
169 col_properties = {'type': ['null', 'string']} 148 # Interesting - order in the anyOf makes a difference.
170 column_gs_type = 'unsupportedValue' 149 # Number w/ multipleOf must be listed last, otherwise errors occur.
171 LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( 150 col_properties = {
172 sheet_title, column_name, column_letter, column_effective_value_type, col_val)) 151 'anyOf': [
173 LOGGER.info('Converting to string.') 152 {
174 else: # skipped 153 'type': 'null'
175 column_is_skipped = True 154 },
176 skipped = skipped + 1 155 {
177 column_index_str = str(column_index).zfill(2) 156 'type': 'number',
178 column_name = '__sdc_skip_col_{}'.format(column_index_str) 157 'multipleOf': 1e-15
179 col_properties = {'type': ['null', 'string']} 158 },
180 column_gs_type = 'stringValue' 159 {
181 LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format( 160 'type': 'string'
182 sheet_title, column_name, column_letter)) 161 }
183 LOGGER.info(' This column will be skipped during data loading.') 162 ]
184 163 }
185 if skipped >= 2: 164 column_gs_type = 'numberType'
186 # skipped = 2 consecutive skipped headers 165 # Catch-all to deal with other types and set to string
187 # Remove prior_header column_name 166 # column_effective_value_type: formulaValue, errorValue, or other
188 sheet_json_schema['properties'].pop(prior_header, None)
189 LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
190 sheet_title, column_name, column_letter))
191 break
192
193 else: 167 else:
194 column = {} 168 col_properties = {'type': ['null', 'string']}
195 column = { 169 column_gs_type = 'unsupportedValue'
196 'columnIndex': column_index, 170 LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
197 'columnLetter': column_letter, 171 sheet_title, column_name, column_letter, column_effective_value_type, col_val))
198 'columnName': column_name, 172 LOGGER.info('Converting to string.')
199 'columnType': column_gs_type, 173 else: # skipped
200 'columnSkipped': column_is_skipped 174 column_is_skipped = True
201 } 175 skipped = skipped + 1
202 columns.append(column) 176 column_index_str = str(column_index).zfill(2)
177 column_name = '__sdc_skip_col_{}'.format(column_index_str)
178 col_properties = {'type': ['null', 'string']}
179 column_gs_type = 'stringValue'
180 LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
181 sheet_title, column_name, column_letter))
182 LOGGER.info(' This column will be skipped during data loading.')
183
184 if skipped >= 2:
185 # skipped = 2 consecutive skipped headers
186 # Remove prior_header column_name
187 sheet_json_schema['properties'].pop(prior_header, None)
188 LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
189 sheet_title, column_name, column_letter))
190 break
191
192 else:
193 column = {}
194 column = {
195 'columnIndex': column_index,
196 'columnLetter': column_letter,
197 'columnName': column_name,
198 'columnType': column_gs_type,
199 'columnSkipped': column_is_skipped
200 }
201 columns.append(column)
203 202
204 sheet_json_schema['properties'][column_name] = col_properties 203 sheet_json_schema['properties'][column_name] = col_properties
205 204
206 prior_header = column_name 205 prior_header = column_name
207 i = i + 1 206 i = i + 1
208 207
209 return sheet_json_schema, columns 208 return sheet_json_schema, columns
210 209
211 210
212# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query 211# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
@@ -232,7 +231,11 @@ def get_sheet_metadata(sheet, spreadsheet_id, client):
232 sheet_metadata = sheet_md_results.get('sheets')[0] 231 sheet_metadata = sheet_md_results.get('sheets')[0]
233 232
234 # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results) 233 # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
235 sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata) 234 try:
235 sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
236 except:
237 LOGGER.info('SKIPPING Malformed sheet: {}'.format(sheet_title))
238 sheet_json_schema, columns = None, None
236 239
237 return sheet_json_schema, columns 240 return sheet_json_schema, columns
238 241