diff options
Diffstat (limited to 'tap_google_sheets/schema.py')
-rw-r--r-- | tap_google_sheets/schema.py | 51 |
1 files changed, 41 insertions, 10 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index 243467b..e319c03 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py | |||
@@ -21,6 +21,7 @@ def colnum_string(num): | |||
21 | 21 | ||
22 | # Create sheet_metadata_json with columns from sheet | 22 | # Create sheet_metadata_json with columns from sheet |
23 | def get_sheet_schema_columns(sheet): | 23 | def get_sheet_schema_columns(sheet): |
24 | sheet_title = sheet.get('properties', {}).get('title') | ||
24 | sheet_json_schema = OrderedDict() | 25 | sheet_json_schema = OrderedDict() |
25 | data = next(iter(sheet.get('data', [])), {}) | 26 | data = next(iter(sheet.get('data', [])), {}) |
26 | row_data = data.get('rowData', []) | 27 | row_data = data.get('rowData', []) |
@@ -62,15 +63,34 @@ def get_sheet_schema_columns(sheet): | |||
62 | skipped = 0 | 63 | skipped = 0 |
63 | column_name = '{}'.format(header_value) | 64 | column_name = '{}'.format(header_value) |
64 | if column_name in header_list: | 65 | if column_name in header_list: |
65 | raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name)) | 66 | raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format( |
67 | sheet_title, column_name, column_letter)) | ||
66 | header_list.append(column_name) | 68 | header_list.append(column_name) |
67 | 69 | ||
68 | first_value = first_values[i] | 70 | first_value = None |
69 | 71 | try: | |
72 | first_value = first_values[i] | ||
73 | except IndexError as err: | ||
74 | raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format( | ||
75 | sheet_title, column_name, column_letter, err)) | ||
76 | |||
70 | column_effective_value = first_value.get('effectiveValue', {}) | 77 | column_effective_value = first_value.get('effectiveValue', {}) |
71 | for key in column_effective_value.keys(): | 78 | |
72 | if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'): | 79 | col_val = None |
73 | column_effective_value_type = key | 80 | if column_effective_value == {}: |
81 | column_effective_value_type = 'stringValue' | ||
82 | LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format( | ||
83 | sheet_title, column_name, column_letter)) | ||
84 | LOGGER.info(' Setting column datatype to STRING') | ||
85 | else: | ||
86 | for key, val in column_effective_value.items(): | ||
87 | if key in ('numberValue', 'stringValue', 'boolValue'): | ||
88 | column_effective_value_type = key | ||
89 | col_val = str(val) | ||
90 | elif key in ('errorType', 'formulaType'): | ||
91 | col_val = str(val) | ||
92 | raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( | ||
93 | sheet_title, column_name, column_letter, key, col_val)) | ||
74 | 94 | ||
75 | column_number_format = first_values[i].get('effectiveFormat', {}).get( | 95 | column_number_format = first_values[i].get('effectiveFormat', {}).get( |
76 | 'numberFormat', {}) | 96 | 'numberFormat', {}) |
@@ -87,7 +107,13 @@ def get_sheet_schema_columns(sheet): | |||
87 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType | 107 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType |
88 | # | 108 | # |
89 | column_format = None # Default | 109 | column_format = None # Default |
90 | if column_effective_value_type == 'stringValue': | 110 | if column_effective_value == {}: |
111 | col_properties = {'type': ['null', 'string']} | ||
112 | column_gs_type = 'stringValue' | ||
113 | LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format( | ||
114 | sheet_title, column_name, column_letter)) | ||
115 | LOGGER.info(' Setting column datatype to STRING') | ||
116 | elif column_effective_value_type == 'stringValue': | ||
91 | col_properties = {'type': ['null', 'string']} | 117 | col_properties = {'type': ['null', 'string']} |
92 | column_gs_type = 'stringValue' | 118 | column_gs_type = 'stringValue' |
93 | elif column_effective_value_type == 'boolValue': | 119 | elif column_effective_value_type == 'boolValue': |
@@ -138,8 +164,8 @@ def get_sheet_schema_columns(sheet): | |||
138 | else: | 164 | else: |
139 | col_properties = {'type': ['null', 'string']} | 165 | col_properties = {'type': ['null', 'string']} |
140 | column_gs_type = 'unsupportedValue' | 166 | column_gs_type = 'unsupportedValue' |
141 | LOGGER.info('Unsupported data type: {}, value: {}'.format(column_name, \ | 167 | LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( |
142 | column_effective_value_type)) | 168 | sheet_title, column_name, column_letter, column_effective_value_type, col_val)) |
143 | LOGGER.info('Converting to string.') | 169 | LOGGER.info('Converting to string.') |
144 | else: # skipped | 170 | else: # skipped |
145 | column_is_skipped = True | 171 | column_is_skipped = True |
@@ -148,11 +174,16 @@ def get_sheet_schema_columns(sheet): | |||
148 | column_name = '__sdc_skip_col_{}'.format(column_index_str) | 174 | column_name = '__sdc_skip_col_{}'.format(column_index_str) |
149 | col_properties = {'type': ['null', 'string']} | 175 | col_properties = {'type': ['null', 'string']} |
150 | column_gs_type = 'stringValue' | 176 | column_gs_type = 'stringValue' |
177 | LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format( | ||
178 | sheet_title, column_name, column_letter)) | ||
179 | LOGGER.info(' This column will be skipped during data loading.') | ||
151 | 180 | ||
152 | if skipped >= 2: | 181 | if skipped >= 2: |
153 | # skipped = 2 consecutive skipped headers | 182 | # skipped = 2 consecutive skipped headers |
154 | # Remove prior_header column_name | 183 | # Remove prior_header column_name |
155 | sheet_json_schema['properties'].pop(prior_header, None) | 184 | sheet_json_schema['properties'].pop(prior_header, None) |
185 | LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format( | ||
186 | sheet_title, column_name, column_letter)) | ||
156 | break | 187 | break |
157 | 188 | ||
158 | else: | 189 | else: |
@@ -245,7 +276,7 @@ def get_schemas(client, spreadsheet_id): | |||
245 | for sheet in sheets: | 276 | for sheet in sheets: |
246 | # GET sheet_json_schema for each worksheet (from function above) | 277 | # GET sheet_json_schema for each worksheet (from function above) |
247 | sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) | 278 | sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) |
248 | LOGGER.info('columns = {}'.format(columns)) | 279 | # LOGGER.info('columns = {}'.format(columns)) |
249 | 280 | ||
250 | sheet_title = sheet.get('properties', {}).get('title') | 281 | sheet_title = sheet.get('properties', {}).get('title') |
251 | schemas[sheet_title] = sheet_json_schema | 282 | schemas[sheet_title] = sheet_json_schema |