aboutsummaryrefslogtreecommitdiffhomepage
path: root/tap_google_sheets/schema.py
diff options
context:
space:
mode:
authorJeff Huth <39202799+jeffhuth-bytecode@users.noreply.github.com>2020-02-24 09:53:26 -0800
committerGitHub <noreply@github.com>2020-02-24 12:53:26 -0500
commit376f1145837541d4fff2ad0e499236761f9873c3 (patch)
treecc086f18b24bda8a86c16c3ec742b89947f382ae /tap_google_sheets/schema.py
parentf1d1d43c6b74a8705e91e908c582e39c68464c0c (diff)
downloadtap-google-sheets-376f1145837541d4fff2ad0e499236761f9873c3.tar.gz
tap-google-sheets-376f1145837541d4fff2ad0e499236761f9873c3.tar.zst
tap-google-sheets-376f1145837541d4fff2ad0e499236761f9873c3.zip
v.0.0.4 Logic to skip empty sheets (#4)v0.0.4
* v.0.0.2 schema and sync changes Change number json schema to anyOf with multipleOf; skip empty rows; move write_bookmark to end of sync.py * v.0.0.3 Sync activate version and error handling Update README.md documentation. Improved logging and handling of errors and warnings. Better null handling in Discovery and Sync. Fix issues with activate version messages. * v.0.0.4 Skip empty worksheets Add logic to skip empty worksheets in Discovery and Sync mode. * schema.py fix number datatype issue Nomber datatypes are being created as strings in targets. The JSON schema order needs to be adjusted so that order is null, number, string.
Diffstat (limited to 'tap_google_sheets/schema.py')
-rw-r--r--tap_google_sheets/schema.py365
1 files changed, 185 insertions, 180 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py
index e319c03..c229d72 100644
--- a/tap_google_sheets/schema.py
+++ b/tap_google_sheets/schema.py
@@ -25,184 +25,188 @@ def get_sheet_schema_columns(sheet):
25 sheet_json_schema = OrderedDict() 25 sheet_json_schema = OrderedDict()
26 data = next(iter(sheet.get('data', [])), {}) 26 data = next(iter(sheet.get('data', [])), {})
27 row_data = data.get('rowData', []) 27 row_data = data.get('rowData', [])
28 # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse 28 if row_data == []:
29 29 # Empty sheet, SKIP
30 headers = row_data[0].get('values', []) 30 LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
31 first_values = row_data[1].get('values', []) 31 return None, None
32 # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) 32 else:
33 33 # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
34 sheet_json_schema = { 34 headers = row_data[0].get('values', [])
35 'type': 'object', 35 first_values = row_data[1].get('values', [])
36 'additionalProperties': False, 36 # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
37 'properties': { 37
38 '__sdc_spreadsheet_id': { 38 sheet_json_schema = {
39 'type': ['null', 'string'] 39 'type': 'object',
40 }, 40 'additionalProperties': False,
41 '__sdc_sheet_id': { 41 'properties': {
42 'type': ['null', 'integer'] 42 '__sdc_spreadsheet_id': {
43 }, 43 'type': ['null', 'string']
44 '__sdc_row': { 44 },
45 'type': ['null', 'integer'] 45 '__sdc_sheet_id': {
46 'type': ['null', 'integer']
47 },
48 '__sdc_row': {
49 'type': ['null', 'integer']
50 }
46 } 51 }
47 } 52 }
48 } 53
49 54 header_list = [] # used for checking uniqueness
50 header_list = [] # used for checking uniqueness 55 columns = []
51 columns = [] 56 prior_header = None
52 prior_header = None 57 i = 0
53 i = 0 58 skipped = 0
54 skipped = 0 59 # Read column headers until end or 2 consecutive skipped headers
55 # Read column headers until end or 2 consecutive skipped headers 60 for header in headers:
56 for header in headers: 61 # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
57 # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True))) 62 column_index = i + 1
58 column_index = i + 1 63 column_letter = colnum_string(column_index)
59 column_letter = colnum_string(column_index) 64 header_value = header.get('formattedValue')
60 header_value = header.get('formattedValue') 65 if header_value: # NOT skipped
61 if header_value: # NOT skipped 66 column_is_skipped = False
62 column_is_skipped = False 67 skipped = 0
63 skipped = 0 68 column_name = '{}'.format(header_value)
64 column_name = '{}'.format(header_value) 69 if column_name in header_list:
65 if column_name in header_list: 70 raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
66 raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
67 sheet_title, column_name, column_letter))
68 header_list.append(column_name)
69
70 first_value = None
71 try:
72 first_value = first_values[i]
73 except IndexError as err:
74 raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
75 sheet_title, column_name, column_letter, err))
76
77 column_effective_value = first_value.get('effectiveValue', {})
78
79 col_val = None
80 if column_effective_value == {}:
81 column_effective_value_type = 'stringValue'
82 LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
83 sheet_title, column_name, column_letter))
84 LOGGER.info(' Setting column datatype to STRING')
85 else:
86 for key, val in column_effective_value.items():
87 if key in ('numberValue', 'stringValue', 'boolValue'):
88 column_effective_value_type = key
89 col_val = str(val)
90 elif key in ('errorType', 'formulaType'):
91 col_val = str(val)
92 raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
93 sheet_title, column_name, column_letter, key, col_val))
94
95 column_number_format = first_values[i].get('effectiveFormat', {}).get(
96 'numberFormat', {})
97 column_number_format_type = column_number_format.get('type')
98
99 # Determine datatype for sheet_json_schema
100 #
101 # column_effective_value_type = numberValue, stringValue, boolValue;
102 # INVALID: errorType, formulaType
103 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
104 #
105 # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
106 # TIME, DATE_TIME, SCIENTIFIC
107 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
108 #
109 column_format = None # Default
110 if column_effective_value == {}:
111 col_properties = {'type': ['null', 'string']}
112 column_gs_type = 'stringValue'
113 LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
114 sheet_title, column_name, column_letter)) 71 sheet_title, column_name, column_letter))
115 LOGGER.info(' Setting column datatype to STRING') 72 header_list.append(column_name)
116 elif column_effective_value_type == 'stringValue': 73
117 col_properties = {'type': ['null', 'string']} 74 first_value = None
118 column_gs_type = 'stringValue' 75 try:
119 elif column_effective_value_type == 'boolValue': 76 first_value = first_values[i]
120 col_properties = {'type': ['null', 'boolean', 'string']} 77 except IndexError as err:
121 column_gs_type = 'boolValue' 78 raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
122 elif column_effective_value_type == 'numberValue': 79 sheet_title, column_name, column_letter, err))
123 if column_number_format_type == 'DATE_TIME': 80
124 col_properties = { 81 column_effective_value = first_value.get('effectiveValue', {})
125 'type': ['null', 'string'], 82
126 'format': 'date-time' 83 col_val = None
127 } 84 if column_effective_value == {}:
128 column_gs_type = 'numberType.DATE_TIME' 85 column_effective_value_type = 'stringValue'
129 elif column_number_format_type == 'DATE': 86 LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
130 col_properties = { 87 sheet_title, column_name, column_letter))
131 'type': ['null', 'string'], 88 LOGGER.info(' Setting column datatype to STRING')
132 'format': 'date' 89 else:
133 } 90 for key, val in column_effective_value.items():
134 column_gs_type = 'numberType.DATE' 91 if key in ('numberValue', 'stringValue', 'boolValue'):
135 elif column_number_format_type == 'TIME': 92 column_effective_value_type = key
136 col_properties = { 93 col_val = str(val)
137 'type': ['null', 'string'], 94 elif key in ('errorType', 'formulaType'):
138 'format': 'time' 95 col_val = str(val)
139 } 96 raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
140 column_gs_type = 'numberType.TIME' 97 sheet_title, column_name, column_letter, key, col_val))
141 elif column_number_format_type == 'TEXT': 98
99 column_number_format = first_values[i].get('effectiveFormat', {}).get(
100 'numberFormat', {})
101 column_number_format_type = column_number_format.get('type')
102
103 # Determine datatype for sheet_json_schema
104 #
105 # column_effective_value_type = numberValue, stringValue, boolValue;
106 # INVALID: errorType, formulaType
107 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
108 #
109 # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
110 # TIME, DATE_TIME, SCIENTIFIC
111 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
112 #
113 column_format = None # Default
114 if column_effective_value == {}:
115 col_properties = {'type': ['null', 'string']}
116 column_gs_type = 'stringValue'
117 LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
118 sheet_title, column_name, column_letter))
119 LOGGER.info(' Setting column datatype to STRING')
120 elif column_effective_value_type == 'stringValue':
142 col_properties = {'type': ['null', 'string']} 121 col_properties = {'type': ['null', 'string']}
143 column_gs_type = 'stringValue' 122 column_gs_type = 'stringValue'
123 elif column_effective_value_type == 'boolValue':
124 col_properties = {'type': ['null', 'boolean', 'string']}
125 column_gs_type = 'boolValue'
126 elif column_effective_value_type == 'numberValue':
127 if column_number_format_type == 'DATE_TIME':
128 col_properties = {
129 'type': ['null', 'string'],
130 'format': 'date-time'
131 }
132 column_gs_type = 'numberType.DATE_TIME'
133 elif column_number_format_type == 'DATE':
134 col_properties = {
135 'type': ['null', 'string'],
136 'format': 'date'
137 }
138 column_gs_type = 'numberType.DATE'
139 elif column_number_format_type == 'TIME':
140 col_properties = {
141 'type': ['null', 'string'],
142 'format': 'time'
143 }
144 column_gs_type = 'numberType.TIME'
145 elif column_number_format_type == 'TEXT':
146 col_properties = {'type': ['null', 'string']}
147 column_gs_type = 'stringValue'
148 else:
149 # Interesting - order in the anyOf makes a difference.
150 # Number w/ multipleOf must be listed last, otherwise errors occur.
151 col_properties = {
152 'anyOf': [
153 {
154 'type': 'null'
155 },
156 {
157 'type': 'number',
158 'multipleOf': 1e-15
159 },
160 {
161 'type': 'string'
162 }
163 ]
164 }
165 column_gs_type = 'numberType'
166 # Catch-all to deal with other types and set to string
167 # column_effective_value_type: formulaValue, errorValue, or other
144 else: 168 else:
145 # Interesting - order in the anyOf makes a difference. 169 col_properties = {'type': ['null', 'string']}
146 # Number w/ multipleOf must be listed last, otherwise errors occur. 170 column_gs_type = 'unsupportedValue'
147 col_properties = { 171 LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
148 'anyOf': [ 172 sheet_title, column_name, column_letter, column_effective_value_type, col_val))
149 { 173 LOGGER.info('Converting to string.')
150 'type': 'string' 174 else: # skipped
151 }, 175 column_is_skipped = True
152 { 176 skipped = skipped + 1
153 'type': 'null' 177 column_index_str = str(column_index).zfill(2)
154 }, 178 column_name = '__sdc_skip_col_{}'.format(column_index_str)
155 {
156 'type': 'number',
157 'multipleOf': 1e-15
158 }
159 ]
160 }
161 column_gs_type = 'numberType'
162 # Catch-all to deal with other types and set to string
163 # column_effective_value_type: formulaValue, errorValue, or other
164 else:
165 col_properties = {'type': ['null', 'string']} 179 col_properties = {'type': ['null', 'string']}
166 column_gs_type = 'unsupportedValue' 180 column_gs_type = 'stringValue'
167 LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( 181 LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
168 sheet_title, column_name, column_letter, column_effective_value_type, col_val)) 182 sheet_title, column_name, column_letter))
169 LOGGER.info('Converting to string.') 183 LOGGER.info(' This column will be skipped during data loading.')
170 else: # skipped
171 column_is_skipped = True
172 skipped = skipped + 1
173 column_index_str = str(column_index).zfill(2)
174 column_name = '__sdc_skip_col_{}'.format(column_index_str)
175 col_properties = {'type': ['null', 'string']}
176 column_gs_type = 'stringValue'
177 LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
178 sheet_title, column_name, column_letter))
179 LOGGER.info(' This column will be skipped during data loading.')
180
181 if skipped >= 2:
182 # skipped = 2 consecutive skipped headers
183 # Remove prior_header column_name
184 sheet_json_schema['properties'].pop(prior_header, None)
185 LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
186 sheet_title, column_name, column_letter))
187 break
188
189 else:
190 column = {}
191 column = {
192 'columnIndex': column_index,
193 'columnLetter': column_letter,
194 'columnName': column_name,
195 'columnType': column_gs_type,
196 'columnSkipped': column_is_skipped
197 }
198 columns.append(column)
199 184
200 sheet_json_schema['properties'][column_name] = col_properties 185 if skipped >= 2:
186 # skipped = 2 consecutive skipped headers
187 # Remove prior_header column_name
188 sheet_json_schema['properties'].pop(prior_header, None)
189 LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
190 sheet_title, column_name, column_letter))
191 break
201 192
202 prior_header = column_name 193 else:
203 i = i + 1 194 column = {}
195 column = {
196 'columnIndex': column_index,
197 'columnLetter': column_letter,
198 'columnName': column_name,
199 'columnType': column_gs_type,
200 'columnSkipped': column_is_skipped
201 }
202 columns.append(column)
204 203
205 return sheet_json_schema, columns 204 sheet_json_schema['properties'][column_name] = col_properties
205
206 prior_header = column_name
207 i = i + 1
208
209 return sheet_json_schema, columns
206 210
207 211
208# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query 212# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
@@ -276,17 +280,18 @@ def get_schemas(client, spreadsheet_id):
276 for sheet in sheets: 280 for sheet in sheets:
277 # GET sheet_json_schema for each worksheet (from function above) 281 # GET sheet_json_schema for each worksheet (from function above)
278 sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) 282 sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
279 # LOGGER.info('columns = {}'.format(columns)) 283
280 284 # SKIP empty sheets (where sheet_json_schema and columns are None)
281 sheet_title = sheet.get('properties', {}).get('title') 285 if sheet_json_schema and columns:
282 schemas[sheet_title] = sheet_json_schema 286 sheet_title = sheet.get('properties', {}).get('title')
283 sheet_mdata = metadata.new() 287 schemas[sheet_title] = sheet_json_schema
284 sheet_mdata = metadata.get_standard_metadata( 288 sheet_mdata = metadata.new()
285 schema=sheet_json_schema, 289 sheet_mdata = metadata.get_standard_metadata(
286 key_properties=['__sdc_row'], 290 schema=sheet_json_schema,
287 valid_replication_keys=None, 291 key_properties=['__sdc_row'],
288 replication_method='FULL_TABLE' 292 valid_replication_keys=None,
289 ) 293 replication_method='FULL_TABLE'
290 field_metadata[sheet_title] = sheet_mdata 294 )
295 field_metadata[sheet_title] = sheet_mdata
291 296
292 return schemas, field_metadata 297 return schemas, field_metadata