import os
import json
from collections import OrderedDict
import singer
from singer import metadata
from tap_google_sheets.streams import STREAMS

LOGGER = singer.get_logger()

# Reference:
# https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata

# Convert column index to column letter
def colnum_string(num):
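    """Convert a 1-based column index to an A1-notation column letter.

    Examples: colnum_string(1) -> 'A', colnum_string(26) -> 'Z',
    colnum_string(27) -> 'AA', colnum_string(53) -> 'BA'.
    """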
    string = ""
    while num > 0:
        num, remainder = divmod(num - 1, 26)
        string = chr(65 + remainder) + string
    return string


# Create sheet_metadata_json with columns from sheet
def get_sheet_schema_columns(sheet):
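    """Build the JSON Schema and column metadata for a single worksheet.

    Returns a (sheet_json_schema, columns) tuple, or (None, None) for an
    empty sheet. Row 1 supplies the column names; row 2's effectiveValue and
    effectiveFormat determine each column's JSON Schema datatype.
    """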
    sheet_title = sheet.get('properties', {}).get('title')
    sheet_json_schema = OrderedDict()
    data = next(iter(sheet.get('data', [])), {})
    row_data = data.get('rowData', [])
    if row_data == []:
        # Empty sheet, SKIP
        LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
        return None, None
    # spreadsheet is an OrderedDict, with ordered sheets and rows in the response
    headers = row_data[0].get('values', [])
    first_values = row_data[1].get('values', [])
    # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
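    # Illustrative rowData shape from the Sheets API response (abridged):
    #   row_data[0] (headers): [{'formattedValue': 'Name'}, {'formattedValue': 'Created At'}, ...]
    #   row_data[1] (2nd row): [{'effectiveValue': {'stringValue': 'abc'},
    #                            'effectiveFormat': {'numberFormat': {...}}}, ...]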

    sheet_json_schema = {
        'type': 'object',
        'additionalProperties': False,
        'properties': {
            '__sdc_spreadsheet_id': {
                'type': ['null', 'string']
            },
            '__sdc_sheet_id': {
                'type': ['null', 'integer']
            },
            '__sdc_row': {
                'type': ['null', 'integer']
            }
        }
    }

    header_list = []  # used for checking uniqueness
    columns = []
    prior_header = None
    i = 0
    skipped = 0
    # Read column headers until end or 2 consecutive skipped headers
    for header in headers:
        # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
        column_index = i + 1
        column_letter = colnum_string(column_index)
        header_value = header.get('formattedValue')
        if header_value:  # NOT skipped
            column_is_skipped = False
            skipped = 0
            column_name = '{}'.format(header_value)
            if column_name in header_list:
                raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
                    sheet_title, column_name, column_letter))
            header_list.append(column_name)

            first_value = None
            try:
                first_value = first_values[i]
            except IndexError as err:
                raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
                    sheet_title, column_name, column_letter, err))

            column_effective_value = first_value.get('effectiveValue', {})

            col_val = None
            if column_effective_value == {}:
                column_effective_value_type = 'stringValue'
                LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
                    sheet_title, column_name, column_letter))
                LOGGER.info('  Setting column datatype to STRING')
            else:
                for key, val in column_effective_value.items():
                    if key in ('numberValue', 'stringValue', 'boolValue'):
                        column_effective_value_type = key
                        col_val = str(val)
                    elif key in ('errorType', 'formulaType'):
                        col_val = str(val)
                        raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
                            sheet_title, column_name, column_letter, key, col_val))

            column_number_format = first_values[i].get('effectiveFormat', {}).get(
                'numberFormat', {})
            column_number_format_type = column_number_format.get('type')

            # Determine datatype for sheet_json_schema
            #
            # column_effective_value_type = numberValue, stringValue, boolValue;
            #   INVALID: errorType, formulaType
            #   https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
            #
            # column_number_format_type = UNSPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
            #   TIME, DATE_TIME, SCIENTIFIC
            #   https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
            #
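            # For example (illustrative): a cell whose effectiveValue is
            # {'numberValue': 44197.0} with numberFormat type DATE maps to
            # {'type': ['null', 'string'], 'format': 'date'}; a plain numberValue
            # with no DATE/TIME/TEXT format falls through to the anyOf
            # null/number/string schema below.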
            column_format = None  # Default
            if column_effective_value == {}:
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'stringValue'
                LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
                    sheet_title, column_name, column_letter))
                LOGGER.info('  Setting column datatype to STRING')
            elif column_effective_value_type == 'stringValue':
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'stringValue'
            elif column_effective_value_type == 'boolValue':
                col_properties = {'type': ['null', 'boolean', 'string']}
                column_gs_type = 'boolValue'
            elif column_effective_value_type == 'numberValue':
                if column_number_format_type == 'DATE_TIME':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'date-time'
                    }
                    column_gs_type = 'numberType.DATE_TIME'
                elif column_number_format_type == 'DATE':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'date'
                    }
                    column_gs_type = 'numberType.DATE'
                elif column_number_format_type == 'TIME':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'time'
                    }
                    column_gs_type = 'numberType.TIME'
                elif column_number_format_type == 'TEXT':
                    col_properties = {'type': ['null', 'string']}
                    column_gs_type = 'stringValue'
                else:
                    # Interesting - order in the anyOf makes a difference.
                    # Number w/ multipleOf must be listed last, otherwise errors occur.
                    col_properties = {
                        'anyOf': [
                            {
                                'type': 'null'
                            },
                            {
                                'type': 'number',
                                'multipleOf': 1e-15
                            },
                            {
                                'type': 'string'
                            }
                        ]
                    }
                    column_gs_type = 'numberType'
            # Catch-all to deal with other types and set to string
            # column_effective_value_type: formulaValue, errorValue, or other
            else:
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'unsupportedValue'
                LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
                    sheet_title, column_name, column_letter, column_effective_value_type, col_val))
                LOGGER.info('Converting to string.')
        else:  # skipped
            column_is_skipped = True
            skipped = skipped + 1
            column_index_str = str(column_index).zfill(2)
            column_name = '__sdc_skip_col_{}'.format(column_index_str)
            col_properties = {'type': ['null', 'string']}
            column_gs_type = 'stringValue'
            LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
                sheet_title, column_name, column_letter))
            LOGGER.info('  This column will be skipped during data loading.')
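            # e.g. a missing header in column C (index 3) yields the placeholder
            # column name '__sdc_skip_col_03'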

        if skipped >= 2:
            # skipped = 2 consecutive skipped headers
            # Remove prior_header column_name
            sheet_json_schema['properties'].pop(prior_header, None)
            LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
                sheet_title, column_name, column_letter))
            break

        else:
            column = {
                'columnIndex': column_index,
                'columnLetter': column_letter,
                'columnName': column_name,
                'columnType': column_gs_type,
                'columnSkipped': column_is_skipped
            }
            columns.append(column)

            sheet_json_schema['properties'][column_name] = col_properties

        prior_header = column_name
        i = i + 1

    return sheet_json_schema, columns


# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
#   endpoint: spreadsheets/{spreadsheet_id}
#   params: includeGridData = true, ranges = '{sheet_title}'!1:2
# This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc.
def get_sheet_metadata(sheet, spreadsheet_id, client):
    sheet_id = sheet.get('properties', {}).get('sheetId')
    sheet_title = sheet.get('properties', {}).get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    stream_name = 'sheet_metadata'
    stream_metadata = STREAMS.get(stream_name)
    api = stream_metadata.get('api', 'sheets')
    params = stream_metadata.get('params', {})
    querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \
        params.items()]).replace('{sheet_title}', sheet_title)
    path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
        spreadsheet_id), querystring)
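    # Illustrative resulting request path (per the params noted above):
    #   spreadsheets/{spreadsheet_id}?includeGridData=true&ranges='{sheet_title}'!1:2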

    sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
    # sheet_metadata: 1st `sheets` node in results
    sheet_metadata = sheet_md_results.get('sheets')[0]

    # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
    try:
        sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
    except Exception:
        LOGGER.info('SKIPPING Malformed sheet: {}'.format(sheet_title))
        sheet_json_schema, columns = None, None

    return sheet_json_schema, columns


def get_abs_path(path):
    return os.path.join(os.path.dirname(os.path.realpath(__file__)), path)

def get_schemas(client, spreadsheet_id):
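    """Return (schemas, field_metadata) dicts for discovery, keyed by stream name.

    Static stream schemas are loaded from schemas/*.json; one additional schema
    is generated per worksheet in the spreadsheet, keyed by the sheet title.
    """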
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None)
        )
        field_metadata[stream_name] = mdata

        if stream_name == 'spreadsheet_metadata':
            api = stream_metadata.get('api', 'sheets')
            params = stream_metadata.get('params', {})
            querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
            path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
                spreadsheet_id), querystring)

            # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
            spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
                endpoint=stream_name)

            sheets = spreadsheet_md_results.get('sheets')
            if sheets:
                # Loop thru each worksheet in spreadsheet
                for sheet in sheets:
                    # GET sheet_json_schema for each worksheet (from function above)
                    sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)

                    # SKIP empty sheets (where sheet_json_schema and columns are None)
                    if sheet_json_schema and columns:
                        sheet_title = sheet.get('properties', {}).get('title')
                        schemas[sheet_title] = sheet_json_schema
                        sheet_mdata = metadata.new()
                        sheet_mdata = metadata.get_standard_metadata(
                            schema=sheet_json_schema,
                            key_properties=['__sdc_row'],
                            valid_replication_keys=None,
                            replication_method='FULL_TABLE'
                        )
                        field_metadata[sheet_title] = sheet_mdata

    return schemas, field_metadata