]>
Commit | Line | Data |
---|---|---|
89643ba6 JH |
1 | import os |
2 | import json | |
3 | from collections import OrderedDict | |
4 | import singer | |
5 | from singer import metadata | |
6 | from tap_google_sheets.streams import STREAMS | |
7 | ||
8 | LOGGER = singer.get_logger() | |
9 | ||
10 | # Reference: | |
11 | # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata | |
12 | ||
13 | # Convert column index to column letter | |
def colnum_string(num):
    """Convert a 1-based column index to its spreadsheet letter name.

    Examples: 1 -> 'A', 26 -> 'Z', 27 -> 'AA', 703 -> 'AAA'.
    Returns '' for num <= 0.
    """
    letters = []
    while num > 0:
        # Bijective base-26: shift to 0-based before each divmod step.
        num, offset = divmod(num - 1, 26)
        letters.append(chr(ord('A') + offset))
    return ''.join(reversed(letters))
20 | ||
21 | ||
22 | # Create sheet_metadata_json with columns from sheet | |
def get_sheet_schema_columns(sheet):
    """Build the JSON schema and column metadata for one worksheet.

    sheet: dict from the Sheets API (spreadsheets.get w/ includeGridData),
        expected to contain the header row (row 1) and first data row (row 2)
        under sheet['data'][0]['rowData'].
    Returns a (sheet_json_schema, columns) tuple, or (None, None) when the
        sheet is empty or has no second row to infer datatypes from.
    Raises Exception on duplicate headers, a header with no value beneath it,
        or an error/formula cell type in row 2.
    """
    sheet_title = sheet.get('properties', {}).get('title')
    sheet_json_schema = OrderedDict()
    data = next(iter(sheet.get('data', [])), {})
    row_data = data.get('rowData', [])
    if not row_data:
        # Empty sheet, SKIP
        LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
        return None, None
    if len(row_data) < 2:
        # Header row only: no 2nd row to infer column datatypes from, SKIP.
        # (Previously this raised an uncaught IndexError on row_data[1].)
        LOGGER.info('SKIPPING Sheet with no data rows: {}'.format(sheet_title))
        return None, None
    # spreadsheet is an OrderedDict, with ordered sheets and rows in the response
    headers = row_data[0].get('values', [])
    first_values = row_data[1].get('values', [])

    # Base schema: every sheet record carries these Singer bookkeeping fields.
    sheet_json_schema = {
        'type': 'object',
        'additionalProperties': False,
        'properties': {
            '__sdc_spreadsheet_id': {
                'type': ['null', 'string']
            },
            '__sdc_sheet_id': {
                'type': ['null', 'integer']
            },
            '__sdc_row': {
                'type': ['null', 'integer']
            }
        }
    }

    header_list = []  # used for checking uniqueness
    columns = []
    prior_header = None
    i = 0
    skipped = 0
    # Read column headers until end or 2 consecutive skipped headers
    for header in headers:
        column_index = i + 1
        column_letter = colnum_string(column_index)
        header_value = header.get('formattedValue')
        if header_value:  # NOT skipped
            column_is_skipped = False
            skipped = 0
            column_name = '{}'.format(header_value)
            if column_name in header_list:
                raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
                    sheet_title, column_name, column_letter))
            header_list.append(column_name)

            # Row 2 cell beneath this header drives the datatype inference.
            try:
                first_value = first_values[i]
            except IndexError as err:
                raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
                    sheet_title, column_name, column_letter, err))

            column_effective_value = first_value.get('effectiveValue', {})

            col_val = None
            # Reset per column: previously this leaked the prior column's type
            # when effectiveValue held only unrecognized keys.
            column_effective_value_type = None
            if column_effective_value == {}:
                column_effective_value_type = 'stringValue'
                LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
                    sheet_title, column_name, column_letter))
                LOGGER.info(' Setting column datatype to STRING')
            else:
                for key, val in column_effective_value.items():
                    if key in ('numberValue', 'stringValue', 'boolValue'):
                        column_effective_value_type = key
                        col_val = str(val)
                    elif key in ('errorType', 'formulaType'):
                        col_val = str(val)
                        raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
                            sheet_title, column_name, column_letter, key, col_val))

            # Reuse first_value (same cell) rather than re-indexing first_values[i].
            column_number_format = first_value.get('effectiveFormat', {}).get(
                'numberFormat', {})
            column_number_format_type = column_number_format.get('type')

            # Determine datatype for sheet_json_schema
            #
            # column_effective_value_type = numberValue, stringValue, boolValue;
            #   INVALID: errorType, formulaType
            #   https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
            #
            # column_number_format_type = UNSPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY,
            #   DATE, TIME, DATE_TIME, SCIENTIFIC
            #   https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
            #
            if column_effective_value == {}:
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'stringValue'
                LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
                    sheet_title, column_name, column_letter))
                LOGGER.info(' Setting column datatype to STRING')
            elif column_effective_value_type == 'stringValue':
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'stringValue'
            elif column_effective_value_type == 'boolValue':
                col_properties = {'type': ['null', 'boolean', 'string']}
                column_gs_type = 'boolValue'
            elif column_effective_value_type == 'numberValue':
                if column_number_format_type == 'DATE_TIME':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'date-time'
                    }
                    column_gs_type = 'numberType.DATE_TIME'
                elif column_number_format_type == 'DATE':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'date'
                    }
                    column_gs_type = 'numberType.DATE'
                elif column_number_format_type == 'TIME':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'time'
                    }
                    column_gs_type = 'numberType.TIME'
                elif column_number_format_type == 'TEXT':
                    col_properties = {'type': ['null', 'string']}
                    column_gs_type = 'stringValue'
                else:
                    # Interesting - order in the anyOf makes a difference.
                    # Number w/ multipleOf must be listed last, otherwise errors occur.
                    col_properties = {
                        'anyOf': [
                            {
                                'type': 'null'
                            },
                            {
                                'type': 'number',
                                'multipleOf': 1e-15
                            },
                            {
                                'type': 'string'
                            }
                        ]
                    }
                    column_gs_type = 'numberType'
            # Catch-all to deal with other types and set to string
            # column_effective_value_type: formulaValue, errorValue, or other
            else:
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'unsupportedValue'
                LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
                    sheet_title, column_name, column_letter, column_effective_value_type, col_val))
                LOGGER.info('Converting to string.')
        else:  # skipped: no header value; emit a placeholder string column
            column_is_skipped = True
            skipped = skipped + 1
            column_index_str = str(column_index).zfill(2)
            column_name = '__sdc_skip_col_{}'.format(column_index_str)
            col_properties = {'type': ['null', 'string']}
            column_gs_type = 'stringValue'
            LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
                sheet_title, column_name, column_letter))
            LOGGER.info(' This column will be skipped during data loading.')

        if skipped >= 2:
            # 2 consecutive skipped headers: stop scanning and drop the prior
            # placeholder column from the schema.
            sheet_json_schema['properties'].pop(prior_header, None)
            LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
                sheet_title, column_name, column_letter))
            break

        else:
            column = {
                'columnIndex': column_index,
                'columnLetter': column_letter,
                'columnName': column_name,
                'columnType': column_gs_type,
                'columnSkipped': column_is_skipped
            }
            columns.append(column)

            sheet_json_schema['properties'][column_name] = col_properties

        prior_header = column_name
        i = i + 1

    return sheet_json_schema, columns
89643ba6 JH |
209 | |
210 | ||
5890b89c JH |
211 | # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query |
212 | # endpoint: spreadsheets/{spreadsheet_id} | |
213 | # params: includeGridData = true, ranges = '{sheet_title}'!1:2 | |
214 | # This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc. | |
89643ba6 JH |
def get_sheet_metadata(sheet, spreadsheet_id, client):
    """Fetch rows 1-2 of one worksheet and derive its schema and columns.

    Queries endpoint spreadsheets/{spreadsheet_id} with includeGridData=true
    and ranges='{sheet_title}'!1:2 (detailed per-cell metadata: data type,
    formatting, etc.), then delegates to get_sheet_schema_columns.

    sheet: worksheet node from the spreadsheet_metadata response.
    spreadsheet_id: target spreadsheet id, substituted into the stream path.
    client: API client exposing get(path=, api=, endpoint=).
    Returns (sheet_json_schema, columns); (None, None) for malformed sheets.
    """
    sheet_id = sheet.get('properties', {}).get('sheetId')
    sheet_title = sheet.get('properties', {}).get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    stream_name = 'sheet_metadata'
    stream_metadata = STREAMS.get(stream_name)
    api = stream_metadata.get('api', 'sheets')
    params = stream_metadata.get('params', {})
    # Build the querystring, substituting this sheet's title into the ranges param.
    querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \
        params.items()]).replace('{sheet_title}', sheet_title)
    path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
        spreadsheet_id), querystring)

    sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
    # sheet_metadata: 1st `sheets` node in results
    sheet_metadata = sheet_md_results.get('sheets')[0]

    # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
    try:
        sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
    except Exception as err:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # propagate; log the cause instead of discarding it.
        LOGGER.info('SKIPPING Malformed sheet: {}, error: {}'.format(sheet_title, err))
        sheet_json_schema, columns = None, None

    return sheet_json_schema, columns
89643ba6 JH |
241 | |
242 | ||
243 | def get_abs_path(path): | |
244 | return os.path.join(os.path.dirname(os.path.realpath(__file__)), path) | |
245 | ||
def get_schemas(client, spreadsheet_id):
    """Run discovery: build schemas and Singer metadata for every stream.

    For each stream declared in STREAMS, loads its static JSON schema from
    schemas/<stream_name>.json and generates standard Singer metadata. When
    the spreadsheet_metadata stream is processed, additionally fetches the
    spreadsheet's worksheets from the API and builds a dynamic schema per
    worksheet (keyed by sheet title) via get_sheet_metadata.

    client: API client exposing get(path=, params=, api=, endpoint=).
    spreadsheet_id: target spreadsheet id, substituted into the stream path.
    Returns (schemas, field_metadata), both keyed by stream/sheet name.
    """
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        # Static schema shipped with the tap for this stream.
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None)
        )
        field_metadata[stream_name] = mdata

        if stream_name == 'spreadsheet_metadata':
            api = stream_metadata.get('api', 'sheets')
            params = stream_metadata.get('params', {})
            querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
            path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
                spreadsheet_id), querystring)

            # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
            spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
                endpoint=stream_name)

            sheets = spreadsheet_md_results.get('sheets')
            if sheets:
                # Loop thru each worksheet in spreadsheet
                for sheet in sheets:
                    # GET sheet_json_schema for each worksheet (from function above)
                    sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)

                    # SKIP empty sheets (where sheet_json_schema and columns are None)
                    if sheet_json_schema and columns:
                        sheet_title = sheet.get('properties', {}).get('title')
                        schemas[sheet_title] = sheet_json_schema
                        sheet_mdata = metadata.new()
                        # Worksheet streams are always FULL_TABLE keyed on row number.
                        sheet_mdata = metadata.get_standard_metadata(
                            schema=sheet_json_schema,
                            key_properties=['__sdc_row'],
                            valid_replication_keys=None,
                            replication_method='FULL_TABLE'
                        )
                        field_metadata[sheet_title] = sheet_mdata

    return schemas, field_metadata