]>
Commit | Line | Data |
---|---|---|
89643ba6 JH |
1 | import os |
2 | import json | |
5fc2ead5 JH |
3 | import re |
4 | import urllib.parse | |
89643ba6 JH |
5 | from collections import OrderedDict |
6 | import singer | |
7 | from singer import metadata | |
8 | from tap_google_sheets.streams import STREAMS | |
9 | ||
10 | LOGGER = singer.get_logger() | |
11 | ||
12 | # Reference: | |
13 | # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata | |
14 | ||
# Convert column index to column letter
def colnum_string(num):
    """Convert a 1-based column index into its spreadsheet column letter.

    Examples: 1 -> 'A', 26 -> 'Z', 27 -> 'AA', 703 -> 'AAA'.
    Returns '' for num <= 0.
    """
    letters = []
    while num > 0:
        # Bijective base-26: shift to 0-based before dividing
        num, remainder = divmod(num - 1, 26)
        letters.append(chr(ord('A') + remainder))
    return ''.join(reversed(letters))
22 | ||
23 | ||
# Create sheet_metadata_json with columns from sheet
def get_sheet_schema_columns(sheet):
    """Infer a JSON schema and column metadata list for one worksheet.

    Inspects the header row (row 1) and the first data row (row 2) from the
    sheet's grid data and derives one JSON-schema property per column, plus
    a list of column descriptor dicts used by the sheet_metadata stream.

    Parameters:
        sheet: dict node from the Sheets API response, containing
            'properties' (with 'title') and 'data' (with 'rowData').

    Returns:
        (sheet_json_schema, columns), or (None, None) when the sheet has
        no data row to infer datatypes from.

    Raises:
        Exception: on duplicate column headers, or on error/formula values
            in the second row (malformed sheet; the caller skips the sheet).
    """
    sheet_title = sheet.get('properties', {}).get('title')
    sheet_json_schema = OrderedDict()
    data = next(iter(sheet.get('data', [])), {})
    row_data = data.get('rowData', [])
    # Datatype inference needs a header row AND a first data row; a sheet
    # with fewer than 2 rows previously crashed with IndexError at
    # row_data[1] below.
    if len(row_data) < 2:
        # Empty sheet, SKIP
        LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
        return None, None
    # spreadsheet is an OrderedDict, with ordered sheets and rows in the response
    headers = row_data[0].get('values', [])
    first_values = row_data[1].get('values', [])

    sheet_json_schema = {
        'type': 'object',
        'additionalProperties': False,
        'properties': {
            '__sdc_spreadsheet_id': {
                'type': ['null', 'string']
            },
            '__sdc_sheet_id': {
                'type': ['null', 'integer']
            },
            '__sdc_row': {
                'type': ['null', 'integer']
            }
        }
    }

    header_list = []  # used for checking uniqueness
    columns = []
    prior_header = None
    i = 0
    skipped = 0
    # Read column headers until end or 2 consecutive skipped headers
    for header in headers:
        column_index = i + 1
        column_letter = colnum_string(column_index)
        header_value = header.get('formattedValue')
        if header_value:  # NOT skipped
            column_is_skipped = False
            skipped = 0
            column_name = '{}'.format(header_value)
            if column_name in header_list:
                raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
                    sheet_title, column_name, column_letter))
            header_list.append(column_name)

            first_value = None
            try:
                first_value = first_values[i]
            except IndexError as err:
                # Row 2 is shorter than the header row; pad it so the
                # effectiveFormat lookup below stays in range.
                LOGGER.info('NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
                    sheet_title, column_name, column_letter, err))
                first_value = {}
                first_values.append(first_value)

            column_effective_value = first_value.get('effectiveValue', {})

            col_val = None
            # Initialize to None so an unrecognized effectiveValue key
            # (e.g. errorValue) falls through to the unsupportedValue
            # catch-all below instead of raising NameError.
            column_effective_value_type = None
            if column_effective_value == {}:
                column_effective_value_type = 'stringValue'
                LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
                    sheet_title, column_name, column_letter))
                LOGGER.info(' Setting column datatype to STRING')
            else:
                for key, val in column_effective_value.items():
                    if key in ('numberValue', 'stringValue', 'boolValue'):
                        column_effective_value_type = key
                        col_val = str(val)
                    elif key in ('errorType', 'formulaType'):
                        # Errors/formulas in the datatype row make the whole
                        # sheet un-discoverable; caller logs and skips it.
                        col_val = str(val)
                        raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
                            sheet_title, column_name, column_letter, key, col_val))

            column_number_format = first_values[i].get('effectiveFormat', {}).get(
                'numberFormat', {})
            column_number_format_type = column_number_format.get('type')

            # Determine datatype for sheet_json_schema
            #
            # column_effective_value_type = numberValue, stringValue, boolValue;
            #   INVALID: errorType, formulaType
            # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
            #
            # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
            #   TIME, DATE_TIME, SCIENTIFIC
            # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
            #
            if column_effective_value == {}:
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'stringValue'
                LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
                    sheet_title, column_name, column_letter))
                LOGGER.info(' Setting column datatype to STRING')
            elif column_effective_value_type == 'stringValue':
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'stringValue'
            elif column_effective_value_type == 'boolValue':
                col_properties = {'type': ['null', 'boolean', 'string']}
                column_gs_type = 'boolValue'
            elif column_effective_value_type == 'numberValue':
                if column_number_format_type == 'DATE_TIME':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'date-time'
                    }
                    column_gs_type = 'numberType.DATE_TIME'
                elif column_number_format_type == 'DATE':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'date'
                    }
                    column_gs_type = 'numberType.DATE'
                elif column_number_format_type == 'TIME':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'time'
                    }
                    column_gs_type = 'numberType.TIME'
                elif column_number_format_type == 'TEXT':
                    col_properties = {'type': ['null', 'string']}
                    column_gs_type = 'stringValue'
                else:
                    # Interesting - order in the anyOf makes a difference.
                    # Number w/ multipleOf must be listed last, otherwise errors occur.
                    col_properties = {
                        'anyOf': [
                            {
                                'type': 'null'
                            },
                            {
                                'type': 'number',
                                'multipleOf': 1e-15
                            },
                            {
                                'type': 'string'
                            }
                        ]
                    }
                    column_gs_type = 'numberType'
            # Catch-all to deal with other types and set to string
            # column_effective_value_type: formulaValue, errorValue, or other
            else:
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'unsupportedValue'
                LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
                    sheet_title, column_name, column_letter, column_effective_value_type, col_val))
                LOGGER.info('Converting to string.')
        else:  # skipped
            column_is_skipped = True
            skipped = skipped + 1
            column_index_str = str(column_index).zfill(2)
            column_name = '__sdc_skip_col_{}'.format(column_index_str)
            col_properties = {'type': ['null', 'string']}
            column_gs_type = 'stringValue'
            LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
                sheet_title, column_name, column_letter))
            LOGGER.info(' This column will be skipped during data loading.')

        if skipped >= 2:
            # skipped = 2 consecutive skipped headers
            # Remove prior_header column_name
            sheet_json_schema['properties'].pop(prior_header, None)
            LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
                sheet_title, column_name, column_letter))
            break

        else:
            column = {
                'columnIndex': column_index,
                'columnLetter': column_letter,
                'columnName': column_name,
                'columnType': column_gs_type,
                'columnSkipped': column_is_skipped
            }
            columns.append(column)

            sheet_json_schema['properties'][column_name] = col_properties

        prior_header = column_name
        i = i + 1

    return sheet_json_schema, columns
89643ba6 JH |
214 | |
215 | ||
# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
#   endpoint: spreadsheets/{spreadsheet_id}
#   params: includeGridData = true, ranges = '{sheet_title}'!1:2
# This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc.
def get_sheet_metadata(sheet, spreadsheet_id, client):
    """Fetch rows 1-2 of one worksheet and derive its schema and columns.

    Builds the sheet_metadata request from STREAMS config, queries the
    Sheets API for the first two rows (with grid data), and delegates
    schema inference to get_sheet_schema_columns. Malformed sheets are
    logged and skipped by returning (None, None).
    """
    sheet_props = sheet.get('properties', {})
    sheet_id = sheet_props.get('sheetId')
    sheet_title = sheet_props.get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    stream_name = 'sheet_metadata'
    stream_metadata = STREAMS.get(stream_name)
    api = stream_metadata.get('api', 'sheets')
    params = stream_metadata.get('params', {})

    # URL-encode the title for the querystring; regex-escape it for the
    # endpoint label passed to the client.
    sheet_title_encoded = urllib.parse.quote_plus(sheet_title)
    sheet_title_escaped = re.escape(sheet_title)
    query_parts = ['%s=%s' % (key, value) for (key, value) in params.items()]
    querystring = '&'.join(query_parts).replace('{sheet_title}', sheet_title_encoded)
    endpoint_path = stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id)
    path = '{}?{}'.format(endpoint_path, querystring)

    sheet_md_results = client.get(path=path, api=api, endpoint=sheet_title_escaped)
    # sheet_metadata: 1st `sheets` node in results
    sheet_metadata = sheet_md_results.get('sheets')[0]

    # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
    try:
        sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
    except Exception as err:
        # Best-effort: a malformed sheet should not abort discovery of the
        # remaining sheets.
        LOGGER.warning('{}'.format(err))
        LOGGER.warning('SKIPPING Malformed sheet: {}'.format(sheet_title))
        sheet_json_schema, columns = None, None

    return sheet_json_schema, columns
89643ba6 JH |
249 | |
250 | ||
def get_abs_path(path):
    """Return *path* resolved relative to this module's directory."""
    base_dir = os.path.dirname(os.path.realpath(__file__))
    return os.path.join(base_dir, path)
253 | ||
def get_schemas(client, spreadsheet_id):
    """Return discovery schemas and field metadata for all streams.

    Loads the static JSON schema file for every stream in STREAMS and
    builds standard Singer metadata for each. For the spreadsheet_metadata
    stream, additionally queries the spreadsheet and generates one dynamic
    schema per non-empty worksheet (keyed by sheet title).

    Parameters:
        client: API client exposing get(path=..., ...) for Sheets requests.
        spreadsheet_id: str id of the Google Spreadsheet.

    Returns:
        (schemas, field_metadata) dicts keyed by stream name / sheet title.
    """
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        # (metadata.new() dead-store removed: get_standard_metadata builds
        # the metadata list from scratch.)
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None)
        )
        field_metadata[stream_name] = mdata

        if stream_name == 'spreadsheet_metadata':
            api = stream_metadata.get('api', 'sheets')
            params = stream_metadata.get('params', {})
            querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
            path = '{}?{}'.format(
                stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id),
                querystring)

            # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
            # NOTE(review): querystring is embedded in path AND passed as
            # params — confirm client.get does not duplicate them.
            spreadsheet_md_results = client.get(path=path, params=querystring, api=api,
                                                endpoint=stream_name)

            sheets = spreadsheet_md_results.get('sheets')
            if sheets:
                # Loop thru each worksheet in spreadsheet
                for sheet in sheets:
                    # GET sheet_json_schema for each worksheet (from function above)
                    sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)

                    # SKIP empty sheets (where sheet_json_schema and columns are None)
                    if sheet_json_schema and columns:
                        sheet_title = sheet.get('properties', {}).get('title')
                        schemas[sheet_title] = sheet_json_schema
                        sheet_mdata = metadata.get_standard_metadata(
                            schema=sheet_json_schema,
                            key_properties=['__sdc_row'],
                            valid_replication_keys=None,
                            replication_method='FULL_TABLE'
                        )
                        field_metadata[sheet_title] = sheet_mdata

    return schemas, field_metadata