aboutsummaryrefslogtreecommitdiffhomepage
path: root/tap_google_sheets/schema.py
diff options
context:
space:
mode:
Diffstat (limited to 'tap_google_sheets/schema.py')
-rw-r--r--tap_google_sheets/schema.py56
1 files changed, 30 insertions, 26 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py
index 237ab06..d4fead5 100644
--- a/tap_google_sheets/schema.py
+++ b/tap_google_sheets/schema.py
@@ -11,19 +11,19 @@ LOGGER = singer.get_logger()
11# https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata 11# https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata
12 12
13# Convert column index to column letter 13# Convert column index to column letter
14def colnum_string(n): 14def colnum_string(num):
15 string = "" 15 string = ""
16 while n > 0: 16 while num > 0:
17 n, remainder = divmod(n - 1, 26) 17 num, remainder = divmod(num - 1, 26)
18 string = chr(65 + remainder) + string 18 string = chr(65 + remainder) + string
19 return string 19 return string
20 20
21 21
22# Create sheet_metadata_json with columns from sheet 22# Create sheet_metadata_json with columns from sheet
23def get_sheet_schema_columns(sheet, spreadsheet_id, client): 23def get_sheet_schema_columns(sheet):
24 sheet_json_schema = OrderedDict() 24 sheet_json_schema = OrderedDict()
25 data = next(iter(sheet.get('data', [])), {}) 25 data = next(iter(sheet.get('data', [])), {})
26 row_data = data.get('rowData',[]) 26 row_data = data.get('rowData', [])
27 # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse 27 # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
28 28
29 headers = row_data[0].get('values', []) 29 headers = row_data[0].get('values', [])
@@ -65,33 +65,32 @@ def get_sheet_schema_columns(sheet, spreadsheet_id, client):
65 column_name = '{}'.format(header_value) 65 column_name = '{}'.format(header_value)
66 if column_name in header_list: 66 if column_name in header_list:
67 raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name)) 67 raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name))
68 else: 68 header_list.append(column_name)
69 header_list.append(column_name)
70 69
71 first_value = first_values[i] 70 first_value = first_values[i]
72 # LOGGER.info('first_value[{}] = {}'.format(i, json.dumps(first_value, indent=2, sort_keys=True)))
73 71
74 column_effective_value = first_value.get('effectiveValue', {}) 72 column_effective_value = first_value.get('effectiveValue', {})
75 for key in column_effective_value.keys(): 73 for key in column_effective_value.keys():
76 if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'): 74 if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'):
77 column_effective_value_type = key 75 column_effective_value_type = key
78 76
79 column_number_format = first_values[i].get('effectiveFormat', {}).get('numberFormat', {}) 77 column_number_format = first_values[i].get('effectiveFormat', {}).get(
78 'numberFormat', {})
80 column_number_format_type = column_number_format.get('type') 79 column_number_format_type = column_number_format.get('type')
81 80
82 # Determine datatype for sheet_json_schema 81 # Determine datatype for sheet_json_schema
83 # 82 #
84 # column_effective_value_type = numberValue, stringValue, boolValue; INVALID: errorType, formulaType 83 # column_effective_value_type = numberValue, stringValue, boolValue;
85 # Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue 84 # INVALID: errorType, formulaType
85 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
86 # 86 #
87 # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, TIME, DATE_TIME, SCIENTIFIC 87 # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
88 # Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType 88 # TIME, DATE_TIME, SCIENTIFIC
89 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
89 # 90 #
90 column_format = None # Default 91 column_format = None # Default
91 # column_multiple_of = None # Default 92 # column_multiple_of = None # Default
92 if column_effective_value_type in ('formulaValue', 'errorValue'): 93 if column_effective_value_type == 'stringValue':
93 raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(column_name))
94 elif column_effective_value_type == 'stringValue':
95 column_type = ['null', 'string'] 94 column_type = ['null', 'string']
96 column_gs_type = 'stringValue' 95 column_gs_type = 'stringValue'
97 elif column_effective_value_type == 'boolValue': 96 elif column_effective_value_type == 'boolValue':
@@ -116,7 +115,9 @@ def get_sheet_schema_columns(sheet, spreadsheet_id, client):
116 else: 115 else:
117 column_type = ['null', 'number', 'string'] 116 column_type = ['null', 'number', 'string']
118 column_gs_type = 'numberType' 117 column_gs_type = 'numberType'
119 118 elif column_effective_value_type in ('formulaValue', 'errorValue'):
119 raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(column_name, \
120 column_effective_value_type))
120 else: # skipped 121 else: # skipped
121 column_is_skipped = True 122 column_is_skipped = True
122 skipped = skipped + 1 123 skipped = skipped + 1
@@ -130,7 +131,6 @@ def get_sheet_schema_columns(sheet, spreadsheet_id, client):
130 # skipped = 2 consecutive skipped headers 131 # skipped = 2 consecutive skipped headers
131 # Remove prior_header column_name 132 # Remove prior_header column_name
132 sheet_json_schema['properties'].pop(prior_header, None) 133 sheet_json_schema['properties'].pop(prior_header, None)
133 column_count = i - 1
134 break 134 break
135 135
136 else: 136 else:
@@ -164,12 +164,14 @@ def get_sheet_metadata(sheet, spreadsheet_id, client):
164 stream_metadata = STREAMS.get(stream_name) 164 stream_metadata = STREAMS.get(stream_name)
165 api = stream_metadata.get('api', 'sheets') 165 api = stream_metadata.get('api', 'sheets')
166 params = stream_metadata.get('params', {}) 166 params = stream_metadata.get('params', {})
167 querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]).replace('{sheet_title}', sheet_title) 167 querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \
168 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id), querystring) 168 params.items()]).replace('{sheet_title}', sheet_title)
169 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
170 spreadsheet_id), querystring)
169 171
170 sheet_md_results = client.get(path=path, api=api, endpoint=stream_name) 172 sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
171 sheet_cols = sheet_md_results.get('sheets')[0] 173 sheet_cols = sheet_md_results.get('sheets')[0]
172 sheet_schema, columns = get_sheet_schema_columns(sheet_cols, spreadsheet_id, client) 174 sheet_schema, columns = get_sheet_schema_columns(sheet_cols)
173 175
174 return sheet_schema, columns 176 return sheet_schema, columns
175 177
@@ -199,20 +201,22 @@ def get_schemas(client, spreadsheet_id):
199 replication_method=stream_metadata.get('replication_method', None) 201 replication_method=stream_metadata.get('replication_method', None)
200 ) 202 )
201 field_metadata[stream_name] = mdata 203 field_metadata[stream_name] = mdata
202 204
203 if stream_name == 'spreadsheet_metadata': 205 if stream_name == 'spreadsheet_metadata':
204 api = stream_metadata.get('api', 'sheets') 206 api = stream_metadata.get('api', 'sheets')
205 params = stream_metadata.get('params', {}) 207 params = stream_metadata.get('params', {})
206 querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]) 208 querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
207 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id), querystring) 209 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
210 spreadsheet_id), querystring)
208 211
209 spreadsheet_md_results = client.get(path=path, params=querystring, api=api, endpoint=stream_name) 212 spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
213 endpoint=stream_name)
210 214
211 sheets = spreadsheet_md_results.get('sheets') 215 sheets = spreadsheet_md_results.get('sheets')
212 if sheets: 216 if sheets:
213 for sheet in sheets: 217 for sheet in sheets:
214 sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) 218 sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
215 # LOGGER.info('sheet_schema = {}'.format(json.dumps(sheet_schema, indent=2, sort_keys=True))) 219 LOGGER.info('columns = {}'.format(columns))
216 220
217 sheet_title = sheet.get('properties', {}).get('title') 221 sheet_title = sheet.get('properties', {}).get('title')
218 schemas[sheet_title] = sheet_schema 222 schemas[sheet_title] = sheet_schema
@@ -224,5 +228,5 @@ def get_schemas(client, spreadsheet_id):
224 replication_method='FULL_TABLE' 228 replication_method='FULL_TABLE'
225 ) 229 )
226 field_metadata[sheet_title] = sheet_mdata 230 field_metadata[sheet_title] = sheet_mdata
227 231
228 return schemas, field_metadata 232 return schemas, field_metadata