]> git.immae.eu Git - github/fretlink/tap-google-sheets.git/blob - tap_google_sheets/schema.py
243467b6c71c7e78efa224eed66c8967664dd2c8
[github/fretlink/tap-google-sheets.git] / tap_google_sheets / schema.py
1 import os
2 import json
3 from collections import OrderedDict
4 import singer
5 from singer import metadata
6 from tap_google_sheets.streams import STREAMS
7
8 LOGGER = singer.get_logger()
9
10 # Reference:
11 # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata
12
13 # Convert column index to column letter
14 def colnum_string(num):
15 string = ""
16 while num > 0:
17 num, remainder = divmod(num - 1, 26)
18 string = chr(65 + remainder) + string
19 return string
20
21
22 # Create sheet_metadata_json with columns from sheet
23 def get_sheet_schema_columns(sheet):
24 sheet_json_schema = OrderedDict()
25 data = next(iter(sheet.get('data', [])), {})
26 row_data = data.get('rowData', [])
27 # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
28
29 headers = row_data[0].get('values', [])
30 first_values = row_data[1].get('values', [])
31 # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
32
33 sheet_json_schema = {
34 'type': 'object',
35 'additionalProperties': False,
36 'properties': {
37 '__sdc_spreadsheet_id': {
38 'type': ['null', 'string']
39 },
40 '__sdc_sheet_id': {
41 'type': ['null', 'integer']
42 },
43 '__sdc_row': {
44 'type': ['null', 'integer']
45 }
46 }
47 }
48
49 header_list = [] # used for checking uniqueness
50 columns = []
51 prior_header = None
52 i = 0
53 skipped = 0
54 # Read column headers until end or 2 consecutive skipped headers
55 for header in headers:
56 # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
57 column_index = i + 1
58 column_letter = colnum_string(column_index)
59 header_value = header.get('formattedValue')
60 if header_value: # NOT skipped
61 column_is_skipped = False
62 skipped = 0
63 column_name = '{}'.format(header_value)
64 if column_name in header_list:
65 raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name))
66 header_list.append(column_name)
67
68 first_value = first_values[i]
69
70 column_effective_value = first_value.get('effectiveValue', {})
71 for key in column_effective_value.keys():
72 if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'):
73 column_effective_value_type = key
74
75 column_number_format = first_values[i].get('effectiveFormat', {}).get(
76 'numberFormat', {})
77 column_number_format_type = column_number_format.get('type')
78
79 # Determine datatype for sheet_json_schema
80 #
81 # column_effective_value_type = numberValue, stringValue, boolValue;
82 # INVALID: errorType, formulaType
83 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
84 #
85 # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
86 # TIME, DATE_TIME, SCIENTIFIC
87 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
88 #
89 column_format = None # Default
90 if column_effective_value_type == 'stringValue':
91 col_properties = {'type': ['null', 'string']}
92 column_gs_type = 'stringValue'
93 elif column_effective_value_type == 'boolValue':
94 col_properties = {'type': ['null', 'boolean', 'string']}
95 column_gs_type = 'boolValue'
96 elif column_effective_value_type == 'numberValue':
97 if column_number_format_type == 'DATE_TIME':
98 col_properties = {
99 'type': ['null', 'string'],
100 'format': 'date-time'
101 }
102 column_gs_type = 'numberType.DATE_TIME'
103 elif column_number_format_type == 'DATE':
104 col_properties = {
105 'type': ['null', 'string'],
106 'format': 'date'
107 }
108 column_gs_type = 'numberType.DATE'
109 elif column_number_format_type == 'TIME':
110 col_properties = {
111 'type': ['null', 'string'],
112 'format': 'time'
113 }
114 column_gs_type = 'numberType.TIME'
115 elif column_number_format_type == 'TEXT':
116 col_properties = {'type': ['null', 'string']}
117 column_gs_type = 'stringValue'
118 else:
119 # Interesting - order in the anyOf makes a difference.
120 # Number w/ multipleOf must be listed last, otherwise errors occur.
121 col_properties = {
122 'anyOf': [
123 {
124 'type': 'string'
125 },
126 {
127 'type': 'null'
128 },
129 {
130 'type': 'number',
131 'multipleOf': 1e-15
132 }
133 ]
134 }
135 column_gs_type = 'numberType'
136 # Catch-all to deal with other types and set to string
137 # column_effective_value_type: formulaValue, errorValue, or other
138 else:
139 col_properties = {'type': ['null', 'string']}
140 column_gs_type = 'unsupportedValue'
141 LOGGER.info('Unsupported data type: {}, value: {}'.format(column_name, \
142 column_effective_value_type))
143 LOGGER.info('Converting to string.')
144 else: # skipped
145 column_is_skipped = True
146 skipped = skipped + 1
147 column_index_str = str(column_index).zfill(2)
148 column_name = '__sdc_skip_col_{}'.format(column_index_str)
149 col_properties = {'type': ['null', 'string']}
150 column_gs_type = 'stringValue'
151
152 if skipped >= 2:
153 # skipped = 2 consecutive skipped headers
154 # Remove prior_header column_name
155 sheet_json_schema['properties'].pop(prior_header, None)
156 break
157
158 else:
159 column = {}
160 column = {
161 'columnIndex': column_index,
162 'columnLetter': column_letter,
163 'columnName': column_name,
164 'columnType': column_gs_type,
165 'columnSkipped': column_is_skipped
166 }
167 columns.append(column)
168
169 sheet_json_schema['properties'][column_name] = col_properties
170
171 prior_header = column_name
172 i = i + 1
173
174 return sheet_json_schema, columns
175
176
177 # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
178 # endpoint: spreadsheets/{spreadsheet_id}
179 # params: includeGridData = true, ranges = '{sheet_title}'!1:2
180 # This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc.
181 def get_sheet_metadata(sheet, spreadsheet_id, client):
182 sheet_id = sheet.get('properties', {}).get('sheetId')
183 sheet_title = sheet.get('properties', {}).get('title')
184 LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))
185
186 stream_name = 'sheet_metadata'
187 stream_metadata = STREAMS.get(stream_name)
188 api = stream_metadata.get('api', 'sheets')
189 params = stream_metadata.get('params', {})
190 querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \
191 params.items()]).replace('{sheet_title}', sheet_title)
192 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
193 spreadsheet_id), querystring)
194
195 sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
196 # sheet_metadata: 1st `sheets` node in results
197 sheet_metadata = sheet_md_results.get('sheets')[0]
198
199 # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
200 sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
201
202 return sheet_json_schema, columns
203
204
205 def get_abs_path(path):
206 return os.path.join(os.path.dirname(os.path.realpath(__file__)), path)
207
208 def get_schemas(client, spreadsheet_id):
209 schemas = {}
210 field_metadata = {}
211
212 for stream_name, stream_metadata in STREAMS.items():
213 schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
214 with open(schema_path) as file:
215 schema = json.load(file)
216 schemas[stream_name] = schema
217 mdata = metadata.new()
218
219 # Documentation:
220 # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
221 # Reference:
222 # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
223 mdata = metadata.get_standard_metadata(
224 schema=schema,
225 key_properties=stream_metadata.get('key_properties', None),
226 valid_replication_keys=stream_metadata.get('replication_keys', None),
227 replication_method=stream_metadata.get('replication_method', None)
228 )
229 field_metadata[stream_name] = mdata
230
231 if stream_name == 'spreadsheet_metadata':
232 api = stream_metadata.get('api', 'sheets')
233 params = stream_metadata.get('params', {})
234 querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
235 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
236 spreadsheet_id), querystring)
237
238 # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
239 spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
240 endpoint=stream_name)
241
242 sheets = spreadsheet_md_results.get('sheets')
243 if sheets:
244 # Loop thru each worksheet in spreadsheet
245 for sheet in sheets:
246 # GET sheet_json_schema for each worksheet (from function above)
247 sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
248 LOGGER.info('columns = {}'.format(columns))
249
250 sheet_title = sheet.get('properties', {}).get('title')
251 schemas[sheet_title] = sheet_json_schema
252 sheet_mdata = metadata.new()
253 sheet_mdata = metadata.get_standard_metadata(
254 schema=sheet_json_schema,
255 key_properties=['__sdc_row'],
256 valid_replication_keys=None,
257 replication_method='FULL_TABLE'
258 )
259 field_metadata[sheet_title] = sheet_mdata
260
261 return schemas, field_metadata