diff options
author | Jeff Huth <jeff.huth@bytecode.io> | 2019-11-13 17:03:56 -0800 |
---|---|---|
committer | Jeff Huth <jeff.huth@bytecode.io> | 2019-11-13 17:03:56 -0800 |
commit | 89643ba6fa98db82efd3246805ef801a8bfb5c81 (patch) | |
tree | 739027b4e827def2db81631c9d6ed58ec2b97809 /tap_google_sheets/schema.py | |
parent | 5f8005471d3affaaf23489df93a58ca64c3da3ca (diff) | |
download | tap-google-sheets-89643ba6fa98db82efd3246805ef801a8bfb5c81.tar.gz tap-google-sheets-89643ba6fa98db82efd3246805ef801a8bfb5c81.tar.zst tap-google-sheets-89643ba6fa98db82efd3246805ef801a8bfb5c81.zip |
Initial commit
Discovery mode works. Still working on normal sync.
Diffstat (limited to 'tap_google_sheets/schema.py')
-rw-r--r-- | tap_google_sheets/schema.py | 228 |
1 files changed, 228 insertions, 0 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py new file mode 100644 index 0000000..237ab06 --- /dev/null +++ b/tap_google_sheets/schema.py | |||
@@ -0,0 +1,228 @@ | |||
1 | import os | ||
2 | import json | ||
3 | from collections import OrderedDict | ||
4 | import singer | ||
5 | from singer import metadata | ||
6 | from tap_google_sheets.streams import STREAMS | ||
7 | |||
8 | LOGGER = singer.get_logger() | ||
9 | |||
10 | # Reference: | ||
11 | # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata | ||
12 | |||
# Convert a 1-based column index to its spreadsheet letter(s) (1 -> A, 27 -> AA)
def colnum_string(n):
    """Return the A1-notation column letters for 1-based column index ``n``."""
    letters = []
    while n > 0:
        n, remainder = divmod(n - 1, 26)
        letters.append(chr(ord('A') + remainder))
    return ''.join(reversed(letters))
20 | |||
21 | |||
# Create a sheet's JSON schema (and column metadata) from its first two rows
def get_sheet_schema_columns(sheet, spreadsheet_id, client):
    """Derive a JSON schema and column metadata list for a single sheet.

    Row 0 of the sheet data supplies the headers; row 1 supplies the first
    data row, whose effective values/formats determine each column's type.

    Args:
        sheet: Sheets API sheet object containing ``data[0].rowData``.
        spreadsheet_id: unused in this function; kept for signature compatibility.
        client: unused in this function; kept for signature compatibility.

    Returns:
        Tuple ``(sheet_json_schema, columns)`` where ``sheet_json_schema`` is a
        JSON-schema dict and ``columns`` is a list of per-column metadata dicts.

    Raises:
        Exception: on duplicate header names, or when a first-row value is an
            error or formula (invalid data types for discovery).
    """
    data = next(iter(sheet.get('data', [])), {})
    row_data = data.get('rowData', [])
    # spreadsheet is an OrderedDict, with ordered sheets and rows in the response

    # NOTE(review): sheets with fewer than 2 rows raise IndexError here —
    # confirm upstream guarantees a header row plus one data row.
    headers = row_data[0].get('values', [])
    first_values = row_data[1].get('values', [])

    # (Dead code removed: the schema dict was previously initialized as an
    # OrderedDict and given 'type'/'additionalProperties' keys, then
    # immediately rebuilt by this literal.)
    sheet_json_schema = {
        'type': 'object',
        'additionalProperties': False,
        'properties': {
            '__sdc_spreadsheet_id': {
                'type': ['null', 'string']
            },
            '__sdc_sheet_id': {
                'type': ['null', 'integer']
            },
            '__sdc_row': {
                'type': ['null', 'integer']
            }
        }
    }

    header_list = []  # used for checking uniqueness
    columns = []
    prior_header = None
    i = 0
    skipped = 0
    # Read column headers until end or 2 consecutive skipped headers
    for header in headers:
        column_index = i + 1
        column_letter = colnum_string(column_index)
        header_value = header.get('formattedValue')
        if header_value:  # NOT skipped
            column_is_skipped = False
            skipped = 0
            column_name = '{}'.format(header_value)
            if column_name in header_list:
                raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name))
            header_list.append(column_name)

            # Guard: the first data row may be shorter than the header row.
            first_value = first_values[i] if i < len(first_values) else {}

            column_effective_value = first_value.get('effectiveValue', {})
            # Default to string when the first data cell has no effective
            # value; previously the variable could be unbound (first column)
            # or stale from a prior column in that case.
            column_effective_value_type = 'stringValue'
            for key in column_effective_value.keys():
                if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'):
                    column_effective_value_type = key

            column_number_format = first_value.get('effectiveFormat', {}).get('numberFormat', {})
            column_number_format_type = column_number_format.get('type')

            # Determine datatype for sheet_json_schema
            #
            # column_effective_value_type = numberValue, stringValue, boolValue;
            #   INVALID: errorType, formulaType
            # Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
            #
            # column_number_format_type = UNSPECIFIED, TEXT, NUMBER, PERCENT,
            #   CURRENCY, DATE, TIME, DATE_TIME, SCIENTIFIC
            # Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
            #
            column_format = None  # Default
            if column_effective_value_type in ('errorType', 'formulaType'):
                # BUGFIX: this previously tested 'formulaValue'/'errorValue',
                # which the key scan above never produces, so the branch was
                # unreachable; the message also lacked its second format
                # argument and would have raised TypeError instead.
                raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(
                    column_name, column_effective_value))
            elif column_effective_value_type == 'stringValue':
                column_type = ['null', 'string']
                column_gs_type = 'stringValue'
            elif column_effective_value_type == 'boolValue':
                column_type = ['null', 'boolean', 'string']
                column_gs_type = 'boolValue'
            elif column_effective_value_type == 'numberValue':
                # Dates/times arrive as serial numbers; map the number format
                # to the matching JSON-schema string format.
                if column_number_format_type == 'DATE_TIME':
                    column_type = ['null', 'string']
                    column_format = 'date-time'
                    column_gs_type = 'numberType.DATE_TIME'
                elif column_number_format_type == 'DATE':
                    column_type = ['null', 'string']
                    column_format = 'date'
                    column_gs_type = 'numberType.DATE'
                elif column_number_format_type == 'TIME':
                    column_type = ['null', 'string']
                    column_format = 'time'
                    column_gs_type = 'numberType.TIME'
                elif column_number_format_type == 'TEXT':
                    column_type = ['null', 'string']
                    column_gs_type = 'stringValue'
                else:
                    column_type = ['null', 'number', 'string']
                    column_gs_type = 'numberType'

        else:  # skipped: blank header cell gets a placeholder column name
            column_is_skipped = True
            skipped = skipped + 1
            column_index_str = str(column_index).zfill(2)
            column_name = '__sdc_skip_col_{}'.format(column_index_str)
            column_type = ['null', 'string']
            column_format = None
            column_gs_type = 'stringValue'

        if skipped >= 2:
            # 2 consecutive skipped headers mark the end of the columns:
            # remove the prior placeholder column and stop scanning.
            # (Unused `column_count` assignment removed.)
            sheet_json_schema['properties'].pop(prior_header, None)
            break

        column = {
            'columnIndex': column_index,
            'columnLetter': column_letter,
            'columnName': column_name,
            'columnType': column_gs_type,
            'columnSkipped': column_is_skipped
        }
        columns.append(column)

        # The schema property intentionally shares the same dict object that
        # was appended to `columns`, so 'type'/'format' appear in both.
        sheet_json_schema['properties'][column_name] = column
        sheet_json_schema['properties'][column_name]['type'] = column_type
        if column_format:
            sheet_json_schema['properties'][column_name]['format'] = column_format

        prior_header = column_name
        i = i + 1

    return sheet_json_schema, columns
156 | |||
157 | |||
def get_sheet_metadata(sheet, spreadsheet_id, client):
    """Fetch one sheet's header/first-data rows and return its schema and columns.

    Builds the request path from the 'sheet_metadata' STREAMS entry, calls the
    Sheets API via ``client``, and delegates schema derivation to
    ``get_sheet_schema_columns``.
    """
    sheet_props = sheet.get('properties', {})
    sheet_id = sheet_props.get('sheetId')
    sheet_title = sheet_props.get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    stream_name = 'sheet_metadata'
    stream_metadata = STREAMS.get(stream_name)
    api = stream_metadata.get('api', 'sheets')
    params = stream_metadata.get('params', {})

    # Substitute the sheet title into the query parameters after joining them.
    query_parts = ['%s=%s' % (key, value) for (key, value) in params.items()]
    querystring = '&'.join(query_parts).replace('{sheet_title}', sheet_title)
    endpoint_path = stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id)
    path = '{}?{}'.format(endpoint_path, querystring)

    sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
    sheet_cols = sheet_md_results.get('sheets')[0]
    return get_sheet_schema_columns(sheet_cols, spreadsheet_id, client)
175 | |||
176 | |||
def get_abs_path(path):
    """Resolve *path* relative to the directory containing this module."""
    module_dir = os.path.dirname(os.path.realpath(__file__))
    return os.path.join(module_dir, path)
179 | |||
def get_schemas(client, spreadsheet_id):
    """Build discovery-mode schemas and field metadata.

    Loads the static JSON schema for every stream in STREAMS, and for the
    'spreadsheet_metadata' stream additionally discovers one dynamic schema
    per sheet in the spreadsheet.

    Returns:
        Tuple ``(schemas, field_metadata)`` keyed by stream name / sheet title.
    """
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        # Load the static schema shipped with the tap.
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None)
        )
        field_metadata[stream_name] = mdata

        if stream_name != 'spreadsheet_metadata':
            continue

        # Discover a dynamic schema for each sheet in the spreadsheet.
        api = stream_metadata.get('api', 'sheets')
        params = stream_metadata.get('params', {})
        querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
        endpoint_path = stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id)
        path = '{}?{}'.format(endpoint_path, querystring)

        # NOTE(review): the querystring is already embedded in `path` and is
        # also passed as `params=` — looks redundant; confirm against the
        # client.get signature before changing.
        spreadsheet_md_results = client.get(path=path, params=querystring, api=api, endpoint=stream_name)

        for sheet in spreadsheet_md_results.get('sheets') or []:
            sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)

            sheet_title = sheet.get('properties', {}).get('title')
            schemas[sheet_title] = sheet_schema
            sheet_mdata = metadata.new()
            sheet_mdata = metadata.get_standard_metadata(
                schema=sheet_schema,
                key_properties=['__sdc_row'],
                valid_replication_keys=None,
                replication_method='FULL_TABLE'
            )
            field_metadata[sheet_title] = sheet_mdata

    return schemas, field_metadata