]> git.immae.eu Git - github/fretlink/tap-google-sheets.git/blame - tap_google_sheets/schema.py
Allow all schemas to fall back to a string value (#25)
[github/fretlink/tap-google-sheets.git] / tap_google_sheets / schema.py
CommitLineData
89643ba6
JH
1import os
2import json
5fc2ead5
JH
3import re
4import urllib.parse
89643ba6
JH
5from collections import OrderedDict
6import singer
7from singer import metadata
8from tap_google_sheets.streams import STREAMS
9
10LOGGER = singer.get_logger()
11
12# Reference:
13# https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata
14
15# Convert column index to column letter
99424fee 16def colnum_string(num):
89643ba6 17 string = ""
99424fee
JH
18 while num > 0:
19 num, remainder = divmod(num - 1, 26)
89643ba6
JH
20 string = chr(65 + remainder) + string
21 return string
22
23
24# Create sheet_metadata_json with columns from sheet
99424fee 25def get_sheet_schema_columns(sheet):
43a24cba 26 sheet_title = sheet.get('properties', {}).get('title')
89643ba6
JH
27 sheet_json_schema = OrderedDict()
28 data = next(iter(sheet.get('data', [])), {})
99424fee 29 row_data = data.get('rowData', [])
376f1145
JH
30 if row_data == []:
31 # Empty sheet, SKIP
32 LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
33 return None, None
f6d7c509 34 # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
35 headers = row_data[0].get('values', [])
36 first_values = row_data[1].get('values', [])
37 # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
376f1145 38
f6d7c509 39 sheet_json_schema = {
40 'type': 'object',
41 'additionalProperties': False,
42 'properties': {
43 '__sdc_spreadsheet_id': {
44 'type': ['null', 'string']
45 },
46 '__sdc_sheet_id': {
47 'type': ['null', 'integer']
48 },
49 '__sdc_row': {
50 'type': ['null', 'integer']
89643ba6
JH
51 }
52 }
f6d7c509 53 }
376f1145 54
f6d7c509 55 header_list = [] # used for checking uniqueness
56 columns = []
57 prior_header = None
58 i = 0
59 skipped = 0
60 # Read column headers until end or 2 consecutive skipped headers
61 for header in headers:
62 # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
63 column_index = i + 1
64 column_letter = colnum_string(column_index)
65 header_value = header.get('formattedValue')
66 if header_value: # NOT skipped
67 column_is_skipped = False
68 skipped = 0
69 column_name = '{}'.format(header_value)
70 if column_name in header_list:
71 raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
72 sheet_title, column_name, column_letter))
73 header_list.append(column_name)
376f1145 74
f6d7c509 75 first_value = None
76 try:
77 first_value = first_values[i]
78 except IndexError as err:
5fc2ead5 79 LOGGER.info('NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
f6d7c509 80 sheet_title, column_name, column_letter, err))
5fc2ead5
JH
81 first_value = {}
82 first_values.append(first_value)
83 pass
376f1145 84
f6d7c509 85 column_effective_value = first_value.get('effectiveValue', {})
376f1145 86
f6d7c509 87 col_val = None
88 if column_effective_value == {}:
89 column_effective_value_type = 'stringValue'
90 LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
91 sheet_title, column_name, column_letter))
92 LOGGER.info(' Setting column datatype to STRING')
93 else:
94 for key, val in column_effective_value.items():
95 if key in ('numberValue', 'stringValue', 'boolValue'):
96 column_effective_value_type = key
97 col_val = str(val)
98 elif key in ('errorType', 'formulaType'):
99 col_val = str(val)
100 raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
101 sheet_title, column_name, column_letter, key, col_val))
376f1145 102
f6d7c509 103 column_number_format = first_values[i].get('effectiveFormat', {}).get(
104 'numberFormat', {})
105 column_number_format_type = column_number_format.get('type')
106
107 # Determine datatype for sheet_json_schema
108 #
109 # column_effective_value_type = numberValue, stringValue, boolValue;
110 # INVALID: errorType, formulaType
111 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
112 #
113 # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
114 # TIME, DATE_TIME, SCIENTIFIC
115 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
116 #
117 column_format = None # Default
118 if column_effective_value == {}:
119 col_properties = {'type': ['null', 'string']}
120 column_gs_type = 'stringValue'
121 LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
122 sheet_title, column_name, column_letter))
123 LOGGER.info(' Setting column datatype to STRING')
124 elif column_effective_value_type == 'stringValue':
125 col_properties = {'type': ['null', 'string']}
126 column_gs_type = 'stringValue'
127 elif column_effective_value_type == 'boolValue':
128 col_properties = {'type': ['null', 'boolean', 'string']}
129 column_gs_type = 'boolValue'
130 elif column_effective_value_type == 'numberValue':
131 if column_number_format_type == 'DATE_TIME':
132 col_properties = {
133 'type': ['null', 'string'],
134 'format': 'date-time'
135 }
136 column_gs_type = 'numberType.DATE_TIME'
137 elif column_number_format_type == 'DATE':
138 col_properties = {
139 'type': ['null', 'string'],
140 'format': 'date'
141 }
142 column_gs_type = 'numberType.DATE'
143 elif column_number_format_type == 'TIME':
144 col_properties = {
145 'type': ['null', 'string'],
146 'format': 'time'
147 }
148 column_gs_type = 'numberType.TIME'
149 elif column_number_format_type == 'TEXT':
5890b89c 150 col_properties = {'type': ['null', 'string']}
89643ba6
JH
151 column_gs_type = 'stringValue'
152 else:
f6d7c509 153 # Interesting - order in the anyOf makes a difference.
154 # Number w/ multipleOf must be listed last, otherwise errors occur.
80f97522 155 col_properties = {'type': 'number', 'multipleOf': 1e-15}
f6d7c509 156 column_gs_type = 'numberType'
157 # Catch-all to deal with other types and set to string
158 # column_effective_value_type: formulaValue, errorValue, or other
159 else:
5890b89c 160 col_properties = {'type': ['null', 'string']}
f6d7c509 161 column_gs_type = 'unsupportedValue'
162 LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
163 sheet_title, column_name, column_letter, column_effective_value_type, col_val))
164 LOGGER.info('Converting to string.')
165 else: # skipped
166 column_is_skipped = True
167 skipped = skipped + 1
168 column_index_str = str(column_index).zfill(2)
169 column_name = '__sdc_skip_col_{}'.format(column_index_str)
170 col_properties = {'type': ['null', 'string']}
171 column_gs_type = 'stringValue'
172 LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
173 sheet_title, column_name, column_letter))
174 LOGGER.info(' This column will be skipped during data loading.')
89643ba6 175
f6d7c509 176 if skipped >= 2:
177 # skipped = 2 consecutive skipped headers
178 # Remove prior_header column_name
179 sheet_json_schema['properties'].pop(prior_header, None)
180 LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
181 sheet_title, column_name, column_letter))
182 break
89643ba6 183
f6d7c509 184 else:
185 column = {}
186 column = {
187 'columnIndex': column_index,
188 'columnLetter': column_letter,
189 'columnName': column_name,
190 'columnType': column_gs_type,
191 'columnSkipped': column_is_skipped
192 }
193 columns.append(column)
89643ba6 194
80f97522
AL
195 if column_gs_type in {'numberType.DATE_TIME', 'numberType.DATE', 'numberType.TIME', 'numberType'}:
196 col_properties = {
197 'anyOf': [
198 col_properties,
199 {'type': ['null', 'string']}
200 ]
201 }
202
f6d7c509 203 sheet_json_schema['properties'][column_name] = col_properties
376f1145 204
f6d7c509 205 prior_header = column_name
206 i = i + 1
376f1145 207
f6d7c509 208 return sheet_json_schema, columns
89643ba6
JH
209
210
5890b89c
JH
211# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
212# endpoint: spreadsheets/{spreadsheet_id}
213# params: includeGridData = true, ranges = '{sheet_title}'!1:2
214# This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc.
89643ba6
JH
215def get_sheet_metadata(sheet, spreadsheet_id, client):
216 sheet_id = sheet.get('properties', {}).get('sheetId')
217 sheet_title = sheet.get('properties', {}).get('title')
218 LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))
219
220 stream_name = 'sheet_metadata'
221 stream_metadata = STREAMS.get(stream_name)
222 api = stream_metadata.get('api', 'sheets')
223 params = stream_metadata.get('params', {})
5fc2ead5
JH
224 sheet_title_encoded = urllib.parse.quote_plus(sheet_title)
225 sheet_title_escaped = re.escape(sheet_title)
99424fee 226 querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \
5fc2ead5 227 params.items()]).replace('{sheet_title}', sheet_title_encoded)
99424fee
JH
228 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
229 spreadsheet_id), querystring)
89643ba6 230
5fc2ead5 231 sheet_md_results = client.get(path=path, api=api, endpoint=sheet_title_escaped)
5890b89c
JH
232 # sheet_metadata: 1st `sheets` node in results
233 sheet_metadata = sheet_md_results.get('sheets')[0]
89643ba6 234
5890b89c 235 # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
f6d7c509 236 try:
237 sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
5fc2ead5
JH
238 except Exception as err:
239 LOGGER.warning('{}'.format(err))
240 LOGGER.warning('SKIPPING Malformed sheet: {}'.format(sheet_title))
f6d7c509 241 sheet_json_schema, columns = None, None
5890b89c
JH
242
243 return sheet_json_schema, columns
89643ba6
JH
244
245
246def get_abs_path(path):
247 return os.path.join(os.path.dirname(os.path.realpath(__file__)), path)
248
249def get_schemas(client, spreadsheet_id):
250 schemas = {}
251 field_metadata = {}
252
253 for stream_name, stream_metadata in STREAMS.items():
254 schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
255 with open(schema_path) as file:
256 schema = json.load(file)
257 schemas[stream_name] = schema
258 mdata = metadata.new()
259
260 # Documentation:
261 # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
262 # Reference:
263 # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
264 mdata = metadata.get_standard_metadata(
265 schema=schema,
266 key_properties=stream_metadata.get('key_properties', None),
267 valid_replication_keys=stream_metadata.get('replication_keys', None),
268 replication_method=stream_metadata.get('replication_method', None)
269 )
270 field_metadata[stream_name] = mdata
99424fee 271
89643ba6
JH
272 if stream_name == 'spreadsheet_metadata':
273 api = stream_metadata.get('api', 'sheets')
274 params = stream_metadata.get('params', {})
275 querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
99424fee
JH
276 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
277 spreadsheet_id), querystring)
89643ba6 278
5890b89c 279 # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
99424fee
JH
280 spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
281 endpoint=stream_name)
89643ba6
JH
282
283 sheets = spreadsheet_md_results.get('sheets')
284 if sheets:
5890b89c 285 # Loop thru each worksheet in spreadsheet
89643ba6 286 for sheet in sheets:
5890b89c
JH
287 # GET sheet_json_schema for each worksheet (from function above)
288 sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
376f1145
JH
289
290 # SKIP empty sheets (where sheet_json_schema and columns are None)
291 if sheet_json_schema and columns:
292 sheet_title = sheet.get('properties', {}).get('title')
293 schemas[sheet_title] = sheet_json_schema
294 sheet_mdata = metadata.new()
295 sheet_mdata = metadata.get_standard_metadata(
296 schema=sheet_json_schema,
297 key_properties=['__sdc_row'],
298 valid_replication_keys=None,
299 replication_method='FULL_TABLE'
300 )
301 field_metadata[sheet_title] = sheet_mdata
99424fee 302
89643ba6 303 return schemas, field_metadata