]> git.immae.eu Git - github/fretlink/tap-google-sheets.git/blame - tap_google_sheets/schema.py
v.1.0.3 Fix slashes and discovery errors (#15)
[github/fretlink/tap-google-sheets.git] / tap_google_sheets / schema.py
CommitLineData
89643ba6
JH
1import os
2import json
5fc2ead5
JH
3import re
4import urllib.parse
89643ba6
JH
5from collections import OrderedDict
6import singer
7from singer import metadata
8from tap_google_sheets.streams import STREAMS
9
10LOGGER = singer.get_logger()
11
12# Reference:
13# https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata
14
15# Convert column index to column letter
99424fee 16def colnum_string(num):
89643ba6 17 string = ""
99424fee
JH
18 while num > 0:
19 num, remainder = divmod(num - 1, 26)
89643ba6
JH
20 string = chr(65 + remainder) + string
21 return string
22
23
24# Create sheet_metadata_json with columns from sheet
99424fee 25def get_sheet_schema_columns(sheet):
43a24cba 26 sheet_title = sheet.get('properties', {}).get('title')
89643ba6
JH
27 sheet_json_schema = OrderedDict()
28 data = next(iter(sheet.get('data', [])), {})
99424fee 29 row_data = data.get('rowData', [])
376f1145
JH
30 if row_data == []:
31 # Empty sheet, SKIP
32 LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
33 return None, None
f6d7c509 34 # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
35 headers = row_data[0].get('values', [])
36 first_values = row_data[1].get('values', [])
37 # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
376f1145 38
f6d7c509 39 sheet_json_schema = {
40 'type': 'object',
41 'additionalProperties': False,
42 'properties': {
43 '__sdc_spreadsheet_id': {
44 'type': ['null', 'string']
45 },
46 '__sdc_sheet_id': {
47 'type': ['null', 'integer']
48 },
49 '__sdc_row': {
50 'type': ['null', 'integer']
89643ba6
JH
51 }
52 }
f6d7c509 53 }
376f1145 54
f6d7c509 55 header_list = [] # used for checking uniqueness
56 columns = []
57 prior_header = None
58 i = 0
59 skipped = 0
60 # Read column headers until end or 2 consecutive skipped headers
61 for header in headers:
62 # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
63 column_index = i + 1
64 column_letter = colnum_string(column_index)
65 header_value = header.get('formattedValue')
66 if header_value: # NOT skipped
67 column_is_skipped = False
68 skipped = 0
69 column_name = '{}'.format(header_value)
70 if column_name in header_list:
71 raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
72 sheet_title, column_name, column_letter))
73 header_list.append(column_name)
376f1145 74
f6d7c509 75 first_value = None
76 try:
77 first_value = first_values[i]
78 except IndexError as err:
5fc2ead5 79 LOGGER.info('NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
f6d7c509 80 sheet_title, column_name, column_letter, err))
5fc2ead5
JH
81 first_value = {}
82 first_values.append(first_value)
83 pass
376f1145 84
f6d7c509 85 column_effective_value = first_value.get('effectiveValue', {})
376f1145 86
f6d7c509 87 col_val = None
88 if column_effective_value == {}:
89 column_effective_value_type = 'stringValue'
90 LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
91 sheet_title, column_name, column_letter))
92 LOGGER.info(' Setting column datatype to STRING')
93 else:
94 for key, val in column_effective_value.items():
95 if key in ('numberValue', 'stringValue', 'boolValue'):
96 column_effective_value_type = key
97 col_val = str(val)
98 elif key in ('errorType', 'formulaType'):
99 col_val = str(val)
100 raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
101 sheet_title, column_name, column_letter, key, col_val))
376f1145 102
f6d7c509 103 column_number_format = first_values[i].get('effectiveFormat', {}).get(
104 'numberFormat', {})
105 column_number_format_type = column_number_format.get('type')
106
107 # Determine datatype for sheet_json_schema
108 #
109 # column_effective_value_type = numberValue, stringValue, boolValue;
110 # INVALID: errorType, formulaType
111 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
112 #
113 # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
114 # TIME, DATE_TIME, SCIENTIFIC
115 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
116 #
117 column_format = None # Default
118 if column_effective_value == {}:
119 col_properties = {'type': ['null', 'string']}
120 column_gs_type = 'stringValue'
121 LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
122 sheet_title, column_name, column_letter))
123 LOGGER.info(' Setting column datatype to STRING')
124 elif column_effective_value_type == 'stringValue':
125 col_properties = {'type': ['null', 'string']}
126 column_gs_type = 'stringValue'
127 elif column_effective_value_type == 'boolValue':
128 col_properties = {'type': ['null', 'boolean', 'string']}
129 column_gs_type = 'boolValue'
130 elif column_effective_value_type == 'numberValue':
131 if column_number_format_type == 'DATE_TIME':
132 col_properties = {
133 'type': ['null', 'string'],
134 'format': 'date-time'
135 }
136 column_gs_type = 'numberType.DATE_TIME'
137 elif column_number_format_type == 'DATE':
138 col_properties = {
139 'type': ['null', 'string'],
140 'format': 'date'
141 }
142 column_gs_type = 'numberType.DATE'
143 elif column_number_format_type == 'TIME':
144 col_properties = {
145 'type': ['null', 'string'],
146 'format': 'time'
147 }
148 column_gs_type = 'numberType.TIME'
149 elif column_number_format_type == 'TEXT':
5890b89c 150 col_properties = {'type': ['null', 'string']}
89643ba6
JH
151 column_gs_type = 'stringValue'
152 else:
f6d7c509 153 # Interesting - order in the anyOf makes a difference.
154 # Number w/ multipleOf must be listed last, otherwise errors occur.
155 col_properties = {
156 'anyOf': [
157 {
158 'type': 'null'
159 },
160 {
161 'type': 'number',
162 'multipleOf': 1e-15
163 },
164 {
165 'type': 'string'
166 }
167 ]
168 }
169 column_gs_type = 'numberType'
170 # Catch-all to deal with other types and set to string
171 # column_effective_value_type: formulaValue, errorValue, or other
172 else:
5890b89c 173 col_properties = {'type': ['null', 'string']}
f6d7c509 174 column_gs_type = 'unsupportedValue'
175 LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
176 sheet_title, column_name, column_letter, column_effective_value_type, col_val))
177 LOGGER.info('Converting to string.')
178 else: # skipped
179 column_is_skipped = True
180 skipped = skipped + 1
181 column_index_str = str(column_index).zfill(2)
182 column_name = '__sdc_skip_col_{}'.format(column_index_str)
183 col_properties = {'type': ['null', 'string']}
184 column_gs_type = 'stringValue'
185 LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
186 sheet_title, column_name, column_letter))
187 LOGGER.info(' This column will be skipped during data loading.')
89643ba6 188
f6d7c509 189 if skipped >= 2:
190 # skipped = 2 consecutive skipped headers
191 # Remove prior_header column_name
192 sheet_json_schema['properties'].pop(prior_header, None)
193 LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
194 sheet_title, column_name, column_letter))
195 break
89643ba6 196
f6d7c509 197 else:
198 column = {}
199 column = {
200 'columnIndex': column_index,
201 'columnLetter': column_letter,
202 'columnName': column_name,
203 'columnType': column_gs_type,
204 'columnSkipped': column_is_skipped
205 }
206 columns.append(column)
89643ba6 207
f6d7c509 208 sheet_json_schema['properties'][column_name] = col_properties
376f1145 209
f6d7c509 210 prior_header = column_name
211 i = i + 1
376f1145 212
f6d7c509 213 return sheet_json_schema, columns
89643ba6
JH
214
215
5890b89c
JH
216# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
217# endpoint: spreadsheets/{spreadsheet_id}
218# params: includeGridData = true, ranges = '{sheet_title}'!1:2
219# This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc.
89643ba6
JH
220def get_sheet_metadata(sheet, spreadsheet_id, client):
221 sheet_id = sheet.get('properties', {}).get('sheetId')
222 sheet_title = sheet.get('properties', {}).get('title')
223 LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))
224
225 stream_name = 'sheet_metadata'
226 stream_metadata = STREAMS.get(stream_name)
227 api = stream_metadata.get('api', 'sheets')
228 params = stream_metadata.get('params', {})
5fc2ead5
JH
229 sheet_title_encoded = urllib.parse.quote_plus(sheet_title)
230 sheet_title_escaped = re.escape(sheet_title)
99424fee 231 querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \
5fc2ead5 232 params.items()]).replace('{sheet_title}', sheet_title_encoded)
99424fee
JH
233 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
234 spreadsheet_id), querystring)
89643ba6 235
5fc2ead5 236 sheet_md_results = client.get(path=path, api=api, endpoint=sheet_title_escaped)
5890b89c
JH
237 # sheet_metadata: 1st `sheets` node in results
238 sheet_metadata = sheet_md_results.get('sheets')[0]
89643ba6 239
5890b89c 240 # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
f6d7c509 241 try:
242 sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
5fc2ead5
JH
243 except Exception as err:
244 LOGGER.warning('{}'.format(err))
245 LOGGER.warning('SKIPPING Malformed sheet: {}'.format(sheet_title))
f6d7c509 246 sheet_json_schema, columns = None, None
5890b89c
JH
247
248 return sheet_json_schema, columns
89643ba6
JH
249
250
251def get_abs_path(path):
252 return os.path.join(os.path.dirname(os.path.realpath(__file__)), path)
253
254def get_schemas(client, spreadsheet_id):
255 schemas = {}
256 field_metadata = {}
257
258 for stream_name, stream_metadata in STREAMS.items():
259 schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
260 with open(schema_path) as file:
261 schema = json.load(file)
262 schemas[stream_name] = schema
263 mdata = metadata.new()
264
265 # Documentation:
266 # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
267 # Reference:
268 # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
269 mdata = metadata.get_standard_metadata(
270 schema=schema,
271 key_properties=stream_metadata.get('key_properties', None),
272 valid_replication_keys=stream_metadata.get('replication_keys', None),
273 replication_method=stream_metadata.get('replication_method', None)
274 )
275 field_metadata[stream_name] = mdata
99424fee 276
89643ba6
JH
277 if stream_name == 'spreadsheet_metadata':
278 api = stream_metadata.get('api', 'sheets')
279 params = stream_metadata.get('params', {})
280 querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
99424fee
JH
281 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
282 spreadsheet_id), querystring)
89643ba6 283
5890b89c 284 # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
99424fee
JH
285 spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
286 endpoint=stream_name)
89643ba6
JH
287
288 sheets = spreadsheet_md_results.get('sheets')
289 if sheets:
5890b89c 290 # Loop thru each worksheet in spreadsheet
89643ba6 291 for sheet in sheets:
5890b89c
JH
292 # GET sheet_json_schema for each worksheet (from function above)
293 sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
376f1145
JH
294
295 # SKIP empty sheets (where sheet_json_schema and columns are None)
296 if sheet_json_schema and columns:
297 sheet_title = sheet.get('properties', {}).get('title')
298 schemas[sheet_title] = sheet_json_schema
299 sheet_mdata = metadata.new()
300 sheet_mdata = metadata.get_standard_metadata(
301 schema=sheet_json_schema,
302 key_properties=['__sdc_row'],
303 valid_replication_keys=None,
304 replication_method='FULL_TABLE'
305 )
306 field_metadata[sheet_title] = sheet_mdata
99424fee 307
89643ba6 308 return schemas, field_metadata