aboutsummaryrefslogtreecommitdiffhomepage
path: root/tap_google_sheets/schema.py
diff options
context:
space:
mode:
authorJeff Huth <39202799+jeffhuth-bytecode@users.noreply.github.com>2019-12-04 06:10:46 -0800
committerKyle Allan <KAllan357@gmail.com>2019-12-04 09:10:46 -0500
commit5890b89c1aa7c554235b3cef156b5a5a2c594bec (patch)
tree2f553cda853991aedb1cec68dc6d06b87d6fe190 /tap_google_sheets/schema.py
parent075af7096d3c1b369702feba4076c25b954732dc (diff)
downloadtap-google-sheets-5890b89c1aa7c554235b3cef156b5a5a2c594bec.tar.gz
tap-google-sheets-5890b89c1aa7c554235b3cef156b5a5a2c594bec.tar.zst
tap-google-sheets-5890b89c1aa7c554235b3cef156b5a5a2c594bec.zip
v.0.0.2 schema and sync changes (#1)v0.0.2
Change number json schema to anyOf with multipleOf; skip empty rows; move write_bookmark to end of sync.py
Diffstat (limited to 'tap_google_sheets/schema.py')
-rw-r--r--tap_google_sheets/schema.py83
1 files changed, 56 insertions, 27 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py
index d4fead5..243467b 100644
--- a/tap_google_sheets/schema.py
+++ b/tap_google_sheets/schema.py
@@ -30,8 +30,6 @@ def get_sheet_schema_columns(sheet):
30 first_values = row_data[1].get('values', []) 30 first_values = row_data[1].get('values', [])
31 # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) 31 # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
32 32
33 sheet_json_schema['type'] = 'object'
34 sheet_json_schema['additionalProperties'] = False
35 sheet_json_schema = { 33 sheet_json_schema = {
36 'type': 'object', 34 'type': 'object',
37 'additionalProperties': False, 35 'additionalProperties': False,
@@ -89,42 +87,66 @@ def get_sheet_schema_columns(sheet):
89 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType 87 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
90 # 88 #
91 column_format = None # Default 89 column_format = None # Default
92 # column_multiple_of = None # Default
93 if column_effective_value_type == 'stringValue': 90 if column_effective_value_type == 'stringValue':
94 column_type = ['null', 'string'] 91 col_properties = {'type': ['null', 'string']}
95 column_gs_type = 'stringValue' 92 column_gs_type = 'stringValue'
96 elif column_effective_value_type == 'boolValue': 93 elif column_effective_value_type == 'boolValue':
97 column_type = ['null', 'boolean', 'string'] 94 col_properties = {'type': ['null', 'boolean', 'string']}
98 column_gs_type = 'boolValue' 95 column_gs_type = 'boolValue'
99 elif column_effective_value_type == 'numberValue': 96 elif column_effective_value_type == 'numberValue':
100 if column_number_format_type == 'DATE_TIME': 97 if column_number_format_type == 'DATE_TIME':
101 column_type = ['null', 'string'] 98 col_properties = {
102 column_format = 'date-time' 99 'type': ['null', 'string'],
100 'format': 'date-time'
101 }
103 column_gs_type = 'numberType.DATE_TIME' 102 column_gs_type = 'numberType.DATE_TIME'
104 elif column_number_format_type == 'DATE': 103 elif column_number_format_type == 'DATE':
105 column_type = ['null', 'string'] 104 col_properties = {
106 column_format = 'date' 105 'type': ['null', 'string'],
106 'format': 'date'
107 }
107 column_gs_type = 'numberType.DATE' 108 column_gs_type = 'numberType.DATE'
108 elif column_number_format_type == 'TIME': 109 elif column_number_format_type == 'TIME':
109 column_type = ['null', 'string'] 110 col_properties = {
110 column_format = 'time' 111 'type': ['null', 'string'],
112 'format': 'time'
113 }
111 column_gs_type = 'numberType.TIME' 114 column_gs_type = 'numberType.TIME'
112 elif column_number_format_type == 'TEXT': 115 elif column_number_format_type == 'TEXT':
113 column_type = ['null', 'string'] 116 col_properties = {'type': ['null', 'string']}
114 column_gs_type = 'stringValue' 117 column_gs_type = 'stringValue'
115 else: 118 else:
116 column_type = ['null', 'number', 'string'] 119 # Interesting - order in the anyOf makes a difference.
120 # Number w/ multipleOf must be listed last, otherwise errors occur.
121 col_properties = {
122 'anyOf': [
123 {
124 'type': 'string'
125 },
126 {
127 'type': 'null'
128 },
129 {
130 'type': 'number',
131 'multipleOf': 1e-15
132 }
133 ]
134 }
117 column_gs_type = 'numberType' 135 column_gs_type = 'numberType'
118 elif column_effective_value_type in ('formulaValue', 'errorValue'): 136 # Catch-all to deal with other types and set to string
119 raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(column_name, \ 137 # column_effective_value_type: formulaValue, errorValue, or other
138 else:
139 col_properties = {'type': ['null', 'string']}
140 column_gs_type = 'unsupportedValue'
141 LOGGER.info('Unsupported data type: {}, value: {}'.format(column_name, \
120 column_effective_value_type)) 142 column_effective_value_type))
143 LOGGER.info('Converting to string.')
121 else: # skipped 144 else: # skipped
122 column_is_skipped = True 145 column_is_skipped = True
123 skipped = skipped + 1 146 skipped = skipped + 1
124 column_index_str = str(column_index).zfill(2) 147 column_index_str = str(column_index).zfill(2)
125 column_name = '__sdc_skip_col_{}'.format(column_index_str) 148 column_name = '__sdc_skip_col_{}'.format(column_index_str)
126 column_type = ['null', 'string'] 149 col_properties = {'type': ['null', 'string']}
127 column_format = None
128 column_gs_type = 'stringValue' 150 column_gs_type = 'stringValue'
129 151
130 if skipped >= 2: 152 if skipped >= 2:
@@ -144,10 +166,7 @@ def get_sheet_schema_columns(sheet):
144 } 166 }
145 columns.append(column) 167 columns.append(column)
146 168
147 sheet_json_schema['properties'][column_name] = column 169 sheet_json_schema['properties'][column_name] = col_properties
148 sheet_json_schema['properties'][column_name]['type'] = column_type
149 if column_format:
150 sheet_json_schema['properties'][column_name]['format'] = column_format
151 170
152 prior_header = column_name 171 prior_header = column_name
153 i = i + 1 172 i = i + 1
@@ -155,6 +174,10 @@ def get_sheet_schema_columns(sheet):
155 return sheet_json_schema, columns 174 return sheet_json_schema, columns
156 175
157 176
177# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
178# endpoint: spreadsheets/{spreadsheet_id}
179# params: includeGridData = true, ranges = '{sheet_title}'!1:2
180# This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc.
158def get_sheet_metadata(sheet, spreadsheet_id, client): 181def get_sheet_metadata(sheet, spreadsheet_id, client):
159 sheet_id = sheet.get('properties', {}).get('sheetId') 182 sheet_id = sheet.get('properties', {}).get('sheetId')
160 sheet_title = sheet.get('properties', {}).get('title') 183 sheet_title = sheet.get('properties', {}).get('title')
@@ -170,10 +193,13 @@ def get_sheet_metadata(sheet, spreadsheet_id, client):
170 spreadsheet_id), querystring) 193 spreadsheet_id), querystring)
171 194
172 sheet_md_results = client.get(path=path, api=api, endpoint=stream_name) 195 sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
173 sheet_cols = sheet_md_results.get('sheets')[0] 196 # sheet_metadata: 1st `sheets` node in results
174 sheet_schema, columns = get_sheet_schema_columns(sheet_cols) 197 sheet_metadata = sheet_md_results.get('sheets')[0]
175 198
176 return sheet_schema, columns 199 # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
200 sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
201
202 return sheet_json_schema, columns
177 203
178 204
179def get_abs_path(path): 205def get_abs_path(path):
@@ -209,20 +235,23 @@ def get_schemas(client, spreadsheet_id):
209 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ 235 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
210 spreadsheet_id), querystring) 236 spreadsheet_id), querystring)
211 237
238 # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
212 spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \ 239 spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
213 endpoint=stream_name) 240 endpoint=stream_name)
214 241
215 sheets = spreadsheet_md_results.get('sheets') 242 sheets = spreadsheet_md_results.get('sheets')
216 if sheets: 243 if sheets:
244 # Loop thru each worksheet in spreadsheet
217 for sheet in sheets: 245 for sheet in sheets:
218 sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) 246 # GET sheet_json_schema for each worksheet (from function above)
247 sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
219 LOGGER.info('columns = {}'.format(columns)) 248 LOGGER.info('columns = {}'.format(columns))
220 249
221 sheet_title = sheet.get('properties', {}).get('title') 250 sheet_title = sheet.get('properties', {}).get('title')
222 schemas[sheet_title] = sheet_schema 251 schemas[sheet_title] = sheet_json_schema
223 sheet_mdata = metadata.new() 252 sheet_mdata = metadata.new()
224 sheet_mdata = metadata.get_standard_metadata( 253 sheet_mdata = metadata.get_standard_metadata(
225 schema=sheet_schema, 254 schema=sheet_json_schema,
226 key_properties=['__sdc_row'], 255 key_properties=['__sdc_row'],
227 valid_replication_keys=None, 256 valid_replication_keys=None,
228 replication_method='FULL_TABLE' 257 replication_method='FULL_TABLE'