aboutsummaryrefslogtreecommitdiffhomepage
path: root/tap_google_sheets/schema.py
diff options
context:
space:
mode:
authorJeff Huth <39202799+jeffhuth-bytecode@users.noreply.github.com>2020-01-09 07:30:53 -0800
committerKyle Allan <KAllan357@gmail.com>2020-01-09 10:30:53 -0500
commit43a24cbab1dbc35b893c35b86e34adc0f2fb84e7 (patch)
treebcbaae860aad0a94bcc4d27f4804504691401438 /tap_google_sheets/schema.py
parent5890b89c1aa7c554235b3cef156b5a5a2c594bec (diff)
downloadtap-google-sheets-43a24cbab1dbc35b893c35b86e34adc0f2fb84e7.tar.gz
tap-google-sheets-43a24cbab1dbc35b893c35b86e34adc0f2fb84e7.tar.zst
tap-google-sheets-43a24cbab1dbc35b893c35b86e34adc0f2fb84e7.zip
v.0.0.3 Sync error handling, activate version, documentation (#2)v0.0.3
* v.0.0.2 schema and sync changes Change number json schema to anyOf with multipleOf; skip empty rows; move write_bookmark to end of sync.py * v.0.0.3 Sync activate version and error handling Update README.md documentation. Improved logging and handling of errors and warnings. Better null handling in Discovery and Sync. Fix issues with activate version messages.
Diffstat (limited to 'tap_google_sheets/schema.py')
-rw-r--r--tap_google_sheets/schema.py51
1 files changed, 41 insertions, 10 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py
index 243467b..e319c03 100644
--- a/tap_google_sheets/schema.py
+++ b/tap_google_sheets/schema.py
@@ -21,6 +21,7 @@ def colnum_string(num):
21 21
22# Create sheet_metadata_json with columns from sheet 22# Create sheet_metadata_json with columns from sheet
23def get_sheet_schema_columns(sheet): 23def get_sheet_schema_columns(sheet):
24 sheet_title = sheet.get('properties', {}).get('title')
24 sheet_json_schema = OrderedDict() 25 sheet_json_schema = OrderedDict()
25 data = next(iter(sheet.get('data', [])), {}) 26 data = next(iter(sheet.get('data', [])), {})
26 row_data = data.get('rowData', []) 27 row_data = data.get('rowData', [])
@@ -62,15 +63,34 @@ def get_sheet_schema_columns(sheet):
62 skipped = 0 63 skipped = 0
63 column_name = '{}'.format(header_value) 64 column_name = '{}'.format(header_value)
64 if column_name in header_list: 65 if column_name in header_list:
65 raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name)) 66 raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
67 sheet_title, column_name, column_letter))
66 header_list.append(column_name) 68 header_list.append(column_name)
67 69
68 first_value = first_values[i] 70 first_value = None
69 71 try:
72 first_value = first_values[i]
73 except IndexError as err:
74 raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
75 sheet_title, column_name, column_letter, err))
76
70 column_effective_value = first_value.get('effectiveValue', {}) 77 column_effective_value = first_value.get('effectiveValue', {})
71 for key in column_effective_value.keys(): 78
72 if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'): 79 col_val = None
73 column_effective_value_type = key 80 if column_effective_value == {}:
81 column_effective_value_type = 'stringValue'
82 LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
83 sheet_title, column_name, column_letter))
84 LOGGER.info(' Setting column datatype to STRING')
85 else:
86 for key, val in column_effective_value.items():
87 if key in ('numberValue', 'stringValue', 'boolValue'):
88 column_effective_value_type = key
89 col_val = str(val)
90 elif key in ('errorType', 'formulaType'):
91 col_val = str(val)
92 raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
93 sheet_title, column_name, column_letter, key, col_val))
74 94
75 column_number_format = first_values[i].get('effectiveFormat', {}).get( 95 column_number_format = first_values[i].get('effectiveFormat', {}).get(
76 'numberFormat', {}) 96 'numberFormat', {})
@@ -87,7 +107,13 @@ def get_sheet_schema_columns(sheet):
87 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType 107 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
88 # 108 #
89 column_format = None # Default 109 column_format = None # Default
90 if column_effective_value_type == 'stringValue': 110 if column_effective_value == {}:
111 col_properties = {'type': ['null', 'string']}
112 column_gs_type = 'stringValue'
113 LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
114 sheet_title, column_name, column_letter))
115 LOGGER.info(' Setting column datatype to STRING')
116 elif column_effective_value_type == 'stringValue':
91 col_properties = {'type': ['null', 'string']} 117 col_properties = {'type': ['null', 'string']}
92 column_gs_type = 'stringValue' 118 column_gs_type = 'stringValue'
93 elif column_effective_value_type == 'boolValue': 119 elif column_effective_value_type == 'boolValue':
@@ -138,8 +164,8 @@ def get_sheet_schema_columns(sheet):
138 else: 164 else:
139 col_properties = {'type': ['null', 'string']} 165 col_properties = {'type': ['null', 'string']}
140 column_gs_type = 'unsupportedValue' 166 column_gs_type = 'unsupportedValue'
141 LOGGER.info('Unsupported data type: {}, value: {}'.format(column_name, \ 167 LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
142 column_effective_value_type)) 168 sheet_title, column_name, column_letter, column_effective_value_type, col_val))
143 LOGGER.info('Converting to string.') 169 LOGGER.info('Converting to string.')
144 else: # skipped 170 else: # skipped
145 column_is_skipped = True 171 column_is_skipped = True
@@ -148,11 +174,16 @@ def get_sheet_schema_columns(sheet):
148 column_name = '__sdc_skip_col_{}'.format(column_index_str) 174 column_name = '__sdc_skip_col_{}'.format(column_index_str)
149 col_properties = {'type': ['null', 'string']} 175 col_properties = {'type': ['null', 'string']}
150 column_gs_type = 'stringValue' 176 column_gs_type = 'stringValue'
177 LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
178 sheet_title, column_name, column_letter))
179 LOGGER.info(' This column will be skipped during data loading.')
151 180
152 if skipped >= 2: 181 if skipped >= 2:
153 # skipped = 2 consecutive skipped headers 182 # skipped = 2 consecutive skipped headers
154 # Remove prior_header column_name 183 # Remove prior_header column_name
155 sheet_json_schema['properties'].pop(prior_header, None) 184 sheet_json_schema['properties'].pop(prior_header, None)
185 LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
186 sheet_title, column_name, column_letter))
156 break 187 break
157 188
158 else: 189 else:
@@ -245,7 +276,7 @@ def get_schemas(client, spreadsheet_id):
245 for sheet in sheets: 276 for sheet in sheets:
246 # GET sheet_json_schema for each worksheet (from function above) 277 # GET sheet_json_schema for each worksheet (from function above)
247 sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) 278 sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
248 LOGGER.info('columns = {}'.format(columns)) 279 # LOGGER.info('columns = {}'.format(columns))
249 280
250 sheet_title = sheet.get('properties', {}).get('title') 281 sheet_title = sheet.get('properties', {}).get('title')
251 schemas[sheet_title] = sheet_json_schema 282 schemas[sheet_title] = sheet_json_schema