]>
Commit | Line | Data |
---|---|---|
89643ba6 JH |
1 | import os |
2 | import json | |
3 | from collections import OrderedDict | |
4 | import singer | |
5 | from singer import metadata | |
6 | from tap_google_sheets.streams import STREAMS | |
7 | ||
8 | LOGGER = singer.get_logger() | |
9 | ||
10 | # Reference: | |
11 | # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata | |
12 | ||
13 | # Convert column index to column letter | |
99424fee | 14 | def colnum_string(num): |
89643ba6 | 15 | string = "" |
99424fee JH |
16 | while num > 0: |
17 | num, remainder = divmod(num - 1, 26) | |
89643ba6 JH |
18 | string = chr(65 + remainder) + string |
19 | return string | |
20 | ||
21 | ||
22 | # Create sheet_metadata_json with columns from sheet | |
99424fee | 23 | def get_sheet_schema_columns(sheet): |
89643ba6 JH |
24 | sheet_json_schema = OrderedDict() |
25 | data = next(iter(sheet.get('data', [])), {}) | |
99424fee | 26 | row_data = data.get('rowData', []) |
89643ba6 JH |
27 | # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse |
28 | ||
29 | headers = row_data[0].get('values', []) | |
30 | first_values = row_data[1].get('values', []) | |
31 | # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) | |
32 | ||
33 | sheet_json_schema['type'] = 'object' | |
34 | sheet_json_schema['additionalProperties'] = False | |
35 | sheet_json_schema = { | |
36 | 'type': 'object', | |
37 | 'additionalProperties': False, | |
38 | 'properties': { | |
39 | '__sdc_spreadsheet_id': { | |
40 | 'type': ['null', 'string'] | |
41 | }, | |
42 | '__sdc_sheet_id': { | |
43 | 'type': ['null', 'integer'] | |
44 | }, | |
45 | '__sdc_row': { | |
46 | 'type': ['null', 'integer'] | |
47 | } | |
48 | } | |
49 | } | |
50 | ||
51 | header_list = [] # used for checking uniqueness | |
52 | columns = [] | |
53 | prior_header = None | |
54 | i = 0 | |
55 | skipped = 0 | |
56 | # Read column headers until end or 2 consecutive skipped headers | |
57 | for header in headers: | |
58 | # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True))) | |
59 | column_index = i + 1 | |
60 | column_letter = colnum_string(column_index) | |
61 | header_value = header.get('formattedValue') | |
62 | if header_value: # NOT skipped | |
63 | column_is_skipped = False | |
64 | skipped = 0 | |
65 | column_name = '{}'.format(header_value) | |
66 | if column_name in header_list: | |
67 | raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name)) | |
99424fee | 68 | header_list.append(column_name) |
89643ba6 JH |
69 | |
70 | first_value = first_values[i] | |
89643ba6 JH |
71 | |
72 | column_effective_value = first_value.get('effectiveValue', {}) | |
73 | for key in column_effective_value.keys(): | |
74 | if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'): | |
75 | column_effective_value_type = key | |
76 | ||
99424fee JH |
77 | column_number_format = first_values[i].get('effectiveFormat', {}).get( |
78 | 'numberFormat', {}) | |
89643ba6 JH |
79 | column_number_format_type = column_number_format.get('type') |
80 | ||
81 | # Determine datatype for sheet_json_schema | |
82 | # | |
99424fee JH |
83 | # column_effective_value_type = numberValue, stringValue, boolValue; |
84 | # INVALID: errorType, formulaType | |
85 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue | |
89643ba6 | 86 | # |
99424fee JH |
87 | # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, |
88 | # TIME, DATE_TIME, SCIENTIFIC | |
89 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType | |
89643ba6 JH |
90 | # |
91 | column_format = None # Default | |
92 | # column_multiple_of = None # Default | |
99424fee | 93 | if column_effective_value_type == 'stringValue': |
89643ba6 JH |
94 | column_type = ['null', 'string'] |
95 | column_gs_type = 'stringValue' | |
96 | elif column_effective_value_type == 'boolValue': | |
97 | column_type = ['null', 'boolean', 'string'] | |
98 | column_gs_type = 'boolValue' | |
99 | elif column_effective_value_type == 'numberValue': | |
100 | if column_number_format_type == 'DATE_TIME': | |
101 | column_type = ['null', 'string'] | |
102 | column_format = 'date-time' | |
103 | column_gs_type = 'numberType.DATE_TIME' | |
104 | elif column_number_format_type == 'DATE': | |
105 | column_type = ['null', 'string'] | |
106 | column_format = 'date' | |
107 | column_gs_type = 'numberType.DATE' | |
108 | elif column_number_format_type == 'TIME': | |
109 | column_type = ['null', 'string'] | |
110 | column_format = 'time' | |
111 | column_gs_type = 'numberType.TIME' | |
112 | elif column_number_format_type == 'TEXT': | |
113 | column_type = ['null', 'string'] | |
114 | column_gs_type = 'stringValue' | |
115 | else: | |
116 | column_type = ['null', 'number', 'string'] | |
117 | column_gs_type = 'numberType' | |
99424fee JH |
118 | elif column_effective_value_type in ('formulaValue', 'errorValue'): |
119 | raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(column_name, \ | |
120 | column_effective_value_type)) | |
89643ba6 JH |
121 | else: # skipped |
122 | column_is_skipped = True | |
123 | skipped = skipped + 1 | |
124 | column_index_str = str(column_index).zfill(2) | |
125 | column_name = '__sdc_skip_col_{}'.format(column_index_str) | |
126 | column_type = ['null', 'string'] | |
127 | column_format = None | |
128 | column_gs_type = 'stringValue' | |
129 | ||
130 | if skipped >= 2: | |
131 | # skipped = 2 consecutive skipped headers | |
132 | # Remove prior_header column_name | |
133 | sheet_json_schema['properties'].pop(prior_header, None) | |
89643ba6 JH |
134 | break |
135 | ||
136 | else: | |
137 | column = {} | |
138 | column = { | |
139 | 'columnIndex': column_index, | |
140 | 'columnLetter': column_letter, | |
141 | 'columnName': column_name, | |
142 | 'columnType': column_gs_type, | |
143 | 'columnSkipped': column_is_skipped | |
144 | } | |
145 | columns.append(column) | |
146 | ||
147 | sheet_json_schema['properties'][column_name] = column | |
148 | sheet_json_schema['properties'][column_name]['type'] = column_type | |
149 | if column_format: | |
150 | sheet_json_schema['properties'][column_name]['format'] = column_format | |
151 | ||
152 | prior_header = column_name | |
153 | i = i + 1 | |
154 | ||
155 | return sheet_json_schema, columns | |
156 | ||
157 | ||
158 | def get_sheet_metadata(sheet, spreadsheet_id, client): | |
159 | sheet_id = sheet.get('properties', {}).get('sheetId') | |
160 | sheet_title = sheet.get('properties', {}).get('title') | |
161 | LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title)) | |
162 | ||
163 | stream_name = 'sheet_metadata' | |
164 | stream_metadata = STREAMS.get(stream_name) | |
165 | api = stream_metadata.get('api', 'sheets') | |
166 | params = stream_metadata.get('params', {}) | |
99424fee JH |
167 | querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \ |
168 | params.items()]).replace('{sheet_title}', sheet_title) | |
169 | path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ | |
170 | spreadsheet_id), querystring) | |
89643ba6 JH |
171 | |
172 | sheet_md_results = client.get(path=path, api=api, endpoint=stream_name) | |
173 | sheet_cols = sheet_md_results.get('sheets')[0] | |
99424fee | 174 | sheet_schema, columns = get_sheet_schema_columns(sheet_cols) |
89643ba6 JH |
175 | |
176 | return sheet_schema, columns | |
177 | ||
178 | ||
179 | def get_abs_path(path): | |
180 | return os.path.join(os.path.dirname(os.path.realpath(__file__)), path) | |
181 | ||
182 | def get_schemas(client, spreadsheet_id): | |
183 | schemas = {} | |
184 | field_metadata = {} | |
185 | ||
186 | for stream_name, stream_metadata in STREAMS.items(): | |
187 | schema_path = get_abs_path('schemas/{}.json'.format(stream_name)) | |
188 | with open(schema_path) as file: | |
189 | schema = json.load(file) | |
190 | schemas[stream_name] = schema | |
191 | mdata = metadata.new() | |
192 | ||
193 | # Documentation: | |
194 | # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions | |
195 | # Reference: | |
196 | # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44 | |
197 | mdata = metadata.get_standard_metadata( | |
198 | schema=schema, | |
199 | key_properties=stream_metadata.get('key_properties', None), | |
200 | valid_replication_keys=stream_metadata.get('replication_keys', None), | |
201 | replication_method=stream_metadata.get('replication_method', None) | |
202 | ) | |
203 | field_metadata[stream_name] = mdata | |
99424fee | 204 | |
89643ba6 JH |
205 | if stream_name == 'spreadsheet_metadata': |
206 | api = stream_metadata.get('api', 'sheets') | |
207 | params = stream_metadata.get('params', {}) | |
208 | querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]) | |
99424fee JH |
209 | path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ |
210 | spreadsheet_id), querystring) | |
89643ba6 | 211 | |
99424fee JH |
212 | spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \ |
213 | endpoint=stream_name) | |
89643ba6 JH |
214 | |
215 | sheets = spreadsheet_md_results.get('sheets') | |
216 | if sheets: | |
217 | for sheet in sheets: | |
218 | sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) | |
99424fee | 219 | LOGGER.info('columns = {}'.format(columns)) |
89643ba6 JH |
220 | |
221 | sheet_title = sheet.get('properties', {}).get('title') | |
222 | schemas[sheet_title] = sheet_schema | |
223 | sheet_mdata = metadata.new() | |
224 | sheet_mdata = metadata.get_standard_metadata( | |
225 | schema=sheet_schema, | |
226 | key_properties=['__sdc_row'], | |
227 | valid_replication_keys=None, | |
228 | replication_method='FULL_TABLE' | |
229 | ) | |
230 | field_metadata[sheet_title] = sheet_mdata | |
99424fee | 231 | |
89643ba6 | 232 | return schemas, field_metadata |