import os
import json
from collections import OrderedDict
import singer
from singer import metadata
from tap_google_sheets.streams import STREAMS

LOGGER = singer.get_logger()

# Reference:
# https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata

# Convert column index to column letter
def colnum_string(num):
    string = ""
    while num > 0:
        num, remainder = divmod(num - 1, 26)
        string = chr(65 + remainder) + string
    return string
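# Sanity check of the A1-style column lettering (illustrative values, verified by hand):
#   colnum_string(1) == 'A', colnum_string(26) == 'Z',
#   colnum_string(27) == 'AA', colnum_string(28) == 'AB'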


# Create sheet_metadata_json with columns from sheet
def get_sheet_schema_columns(sheet):
    sheet_json_schema = OrderedDict()
    data = next(iter(sheet.get('data', [])), {})
    row_data = data.get('rowData', [])
    # spreadsheet is an OrderedDict, with ordered sheets and rows in the response

    headers = row_data[0].get('values', [])
    first_values = row_data[1].get('values', [])
    # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))

    sheet_json_schema = {
        'type': 'object',
        'additionalProperties': False,
        'properties': {
            '__sdc_spreadsheet_id': {
                'type': ['null', 'string']
            },
            '__sdc_sheet_id': {
                'type': ['null', 'integer']
            },
            '__sdc_row': {
                'type': ['null', 'integer']
            }
        }
    }
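    # __sdc_spreadsheet_id, __sdc_sheet_id, and __sdc_row are tap-added bookkeeping fields;
    # each emitted record is expected to carry the spreadsheet id, sheet id, and source row
    # number alongside the sheet's own columns.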

    header_list = [] # used for checking uniqueness
    columns = []
    prior_header = None
    i = 0
    skipped = 0
    # Read column headers until end or 2 consecutive skipped headers
    for header in headers:
        # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
        column_index = i + 1
        column_letter = colnum_string(column_index)
        header_value = header.get('formattedValue')
        if header_value: # NOT skipped
            column_is_skipped = False
            skipped = 0
            column_name = '{}'.format(header_value)
            if column_name in header_list:
                raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name))
            header_list.append(column_name)

            first_value = first_values[i]

            column_effective_value = first_value.get('effectiveValue', {})
            for key in column_effective_value.keys():
                if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'):
                    column_effective_value_type = key

            column_number_format = first_values[i].get('effectiveFormat', {}).get(
                'numberFormat', {})
            column_number_format_type = column_number_format.get('type')

            # Determine datatype for sheet_json_schema
            #
            # column_effective_value_type = numberValue, stringValue, boolValue;
            #   INVALID: errorType, formulaType
            #   https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
            #
            # column_number_format_type = UNSPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
            #   TIME, DATE_TIME, SCIENTIFIC
            #   https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
            #
            column_format = None # Default
            if column_effective_value_type == 'stringValue':
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'stringValue'
            elif column_effective_value_type == 'boolValue':
                col_properties = {'type': ['null', 'boolean', 'string']}
                column_gs_type = 'boolValue'
            elif column_effective_value_type == 'numberValue':
                if column_number_format_type == 'DATE_TIME':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'date-time'
                    }
                    column_gs_type = 'numberType.DATE_TIME'
                elif column_number_format_type == 'DATE':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'date'
                    }
                    column_gs_type = 'numberType.DATE'
                elif column_number_format_type == 'TIME':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'time'
                    }
                    column_gs_type = 'numberType.TIME'
                elif column_number_format_type == 'TEXT':
                    col_properties = {'type': ['null', 'string']}
                    column_gs_type = 'stringValue'
                else:
                    # Interesting - order in the anyOf makes a difference.
                    # Number w/ multipleOf must be listed last, otherwise errors occur.
                    col_properties = {
                        'anyOf': [
                            {
                                'type': 'string'
                            },
                            {
                                'type': 'null'
                            },
                            {
                                'type': 'number',
                                'multipleOf': 1e-15
                            }
                        ]
                    }
                    column_gs_type = 'numberType'
            # Catch-all to deal with other types and set to string
            # column_effective_value_type: formulaValue, errorValue, or other
            else:
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'unsupportedValue'
                LOGGER.info('Unsupported data type: {}, value: {}'.format(column_name, \
                    column_effective_value_type))
                LOGGER.info('Converting to string.')
        else: # skipped
            column_is_skipped = True
            skipped = skipped + 1
            column_index_str = str(column_index).zfill(2)
            column_name = '__sdc_skip_col_{}'.format(column_index_str)
            col_properties = {'type': ['null', 'string']}
            column_gs_type = 'stringValue'

        if skipped >= 2:
            # skipped = 2 consecutive skipped headers
            # Remove prior_header column_name
            sheet_json_schema['properties'].pop(prior_header, None)
            break

        else:
            column = {
                'columnIndex': column_index,
                'columnLetter': column_letter,
                'columnName': column_name,
                'columnType': column_gs_type,
                'columnSkipped': column_is_skipped
            }
            columns.append(column)

            sheet_json_schema['properties'][column_name] = col_properties

            prior_header = column_name
            i = i + 1

    return sheet_json_schema, columns
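# Illustrative sketch (hypothetical data, not taken from a real API response): if a sheet's
# header row is ['id', 'name'] and its first data row is [1, 'abc'], get_sheet_schema_columns
# returns a schema whose 'name' property is {'type': ['null', 'string']}, whose 'id' property
# uses the anyOf number/string form above, and a columns list like
# [{'columnIndex': 1, 'columnLetter': 'A', 'columnName': 'id', 'columnType': 'numberType',
#   'columnSkipped': False}, ...].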


# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
# endpoint: spreadsheets/{spreadsheet_id}
# params: includeGridData = true, ranges = '{sheet_title}'!1:2
# This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc.
def get_sheet_metadata(sheet, spreadsheet_id, client):
    sheet_id = sheet.get('properties', {}).get('sheetId')
    sheet_title = sheet.get('properties', {}).get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    stream_name = 'sheet_metadata'
    stream_metadata = STREAMS.get(stream_name)
    api = stream_metadata.get('api', 'sheets')
    params = stream_metadata.get('params', {})
    querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \
        params.items()]).replace('{sheet_title}', sheet_title)
    path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
        spreadsheet_id), querystring)
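    # Illustrative shape of the request path, assuming STREAMS['sheet_metadata'] defines
    # path 'spreadsheets/{spreadsheet_id}' with params includeGridData=true and
    # ranges='{sheet_title}'!1:2 (per the comment above):
    #   spreadsheets/<spreadsheet_id>?includeGridData=true&ranges='Sheet1'!1:2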

    sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
    # sheet_metadata: 1st `sheets` node in results
    sheet_metadata = sheet_md_results.get('sheets')[0]

    # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
    sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)

    return sheet_json_schema, columns


def get_abs_path(path):
    return os.path.join(os.path.dirname(os.path.realpath(__file__)), path)

def get_schemas(client, spreadsheet_id):
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None)
        )
        field_metadata[stream_name] = mdata

        if stream_name == 'spreadsheet_metadata':
            api = stream_metadata.get('api', 'sheets')
            params = stream_metadata.get('params', {})
            querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
            path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
                spreadsheet_id), querystring)

            # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
            spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
                endpoint=stream_name)

            sheets = spreadsheet_md_results.get('sheets')
            if sheets:
                # Loop thru each worksheet in spreadsheet
                for sheet in sheets:
                    # GET sheet_json_schema for each worksheet (from function above)
                    sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
                    LOGGER.info('columns = {}'.format(columns))

                    sheet_title = sheet.get('properties', {}).get('title')
                    schemas[sheet_title] = sheet_json_schema
                    sheet_mdata = metadata.new()
                    sheet_mdata = metadata.get_standard_metadata(
                        schema=sheet_json_schema,
                        key_properties=['__sdc_row'],
                        valid_replication_keys=None,
                        replication_method='FULL_TABLE'
                    )
                    field_metadata[sheet_title] = sheet_mdata

    return schemas, field_metadata
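
# Minimal usage sketch (hypothetical caller, not defined in this module): a discovery routine
# would typically call
#
#   schemas, field_metadata = get_schemas(client, spreadsheet_id)
#
# where `client` is the tap's authenticated HTTP client exposing get(path, api, endpoint)
# and `spreadsheet_id` comes from the tap config. `schemas` maps each stream name or sheet
# title to its JSON schema, and `field_metadata` holds the corresponding Singer metadata.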