]> git.immae.eu Git - github/fretlink/tap-google-sheets.git/blame - tap_google_sheets/schema.py
pylint and testing
[github/fretlink/tap-google-sheets.git] / tap_google_sheets / schema.py
CommitLineData
89643ba6
JH
1import os
2import json
3from collections import OrderedDict
4import singer
5from singer import metadata
6from tap_google_sheets.streams import STREAMS
7
8LOGGER = singer.get_logger()
9
10# Reference:
11# https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata
12
13# Convert column index to column letter
99424fee 14def colnum_string(num):
89643ba6 15 string = ""
99424fee
JH
16 while num > 0:
17 num, remainder = divmod(num - 1, 26)
89643ba6
JH
18 string = chr(65 + remainder) + string
19 return string
20
21
22# Create sheet_metadata_json with columns from sheet
99424fee 23def get_sheet_schema_columns(sheet):
89643ba6
JH
24 sheet_json_schema = OrderedDict()
25 data = next(iter(sheet.get('data', [])), {})
99424fee 26 row_data = data.get('rowData', [])
89643ba6
JH
27 # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
28
29 headers = row_data[0].get('values', [])
30 first_values = row_data[1].get('values', [])
31 # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
32
33 sheet_json_schema['type'] = 'object'
34 sheet_json_schema['additionalProperties'] = False
35 sheet_json_schema = {
36 'type': 'object',
37 'additionalProperties': False,
38 'properties': {
39 '__sdc_spreadsheet_id': {
40 'type': ['null', 'string']
41 },
42 '__sdc_sheet_id': {
43 'type': ['null', 'integer']
44 },
45 '__sdc_row': {
46 'type': ['null', 'integer']
47 }
48 }
49 }
50
51 header_list = [] # used for checking uniqueness
52 columns = []
53 prior_header = None
54 i = 0
55 skipped = 0
56 # Read column headers until end or 2 consecutive skipped headers
57 for header in headers:
58 # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
59 column_index = i + 1
60 column_letter = colnum_string(column_index)
61 header_value = header.get('formattedValue')
62 if header_value: # NOT skipped
63 column_is_skipped = False
64 skipped = 0
65 column_name = '{}'.format(header_value)
66 if column_name in header_list:
67 raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name))
99424fee 68 header_list.append(column_name)
89643ba6
JH
69
70 first_value = first_values[i]
89643ba6
JH
71
72 column_effective_value = first_value.get('effectiveValue', {})
73 for key in column_effective_value.keys():
74 if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'):
75 column_effective_value_type = key
76
99424fee
JH
77 column_number_format = first_values[i].get('effectiveFormat', {}).get(
78 'numberFormat', {})
89643ba6
JH
79 column_number_format_type = column_number_format.get('type')
80
81 # Determine datatype for sheet_json_schema
82 #
99424fee
JH
83 # column_effective_value_type = numberValue, stringValue, boolValue;
84 # INVALID: errorType, formulaType
85 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
89643ba6 86 #
99424fee
JH
87 # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
88 # TIME, DATE_TIME, SCIENTIFIC
89 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
89643ba6
JH
90 #
91 column_format = None # Default
92 # column_multiple_of = None # Default
99424fee 93 if column_effective_value_type == 'stringValue':
89643ba6
JH
94 column_type = ['null', 'string']
95 column_gs_type = 'stringValue'
96 elif column_effective_value_type == 'boolValue':
97 column_type = ['null', 'boolean', 'string']
98 column_gs_type = 'boolValue'
99 elif column_effective_value_type == 'numberValue':
100 if column_number_format_type == 'DATE_TIME':
101 column_type = ['null', 'string']
102 column_format = 'date-time'
103 column_gs_type = 'numberType.DATE_TIME'
104 elif column_number_format_type == 'DATE':
105 column_type = ['null', 'string']
106 column_format = 'date'
107 column_gs_type = 'numberType.DATE'
108 elif column_number_format_type == 'TIME':
109 column_type = ['null', 'string']
110 column_format = 'time'
111 column_gs_type = 'numberType.TIME'
112 elif column_number_format_type == 'TEXT':
113 column_type = ['null', 'string']
114 column_gs_type = 'stringValue'
115 else:
116 column_type = ['null', 'number', 'string']
117 column_gs_type = 'numberType'
99424fee
JH
118 elif column_effective_value_type in ('formulaValue', 'errorValue'):
119 raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(column_name, \
120 column_effective_value_type))
89643ba6
JH
121 else: # skipped
122 column_is_skipped = True
123 skipped = skipped + 1
124 column_index_str = str(column_index).zfill(2)
125 column_name = '__sdc_skip_col_{}'.format(column_index_str)
126 column_type = ['null', 'string']
127 column_format = None
128 column_gs_type = 'stringValue'
129
130 if skipped >= 2:
131 # skipped = 2 consecutive skipped headers
132 # Remove prior_header column_name
133 sheet_json_schema['properties'].pop(prior_header, None)
89643ba6
JH
134 break
135
136 else:
137 column = {}
138 column = {
139 'columnIndex': column_index,
140 'columnLetter': column_letter,
141 'columnName': column_name,
142 'columnType': column_gs_type,
143 'columnSkipped': column_is_skipped
144 }
145 columns.append(column)
146
147 sheet_json_schema['properties'][column_name] = column
148 sheet_json_schema['properties'][column_name]['type'] = column_type
149 if column_format:
150 sheet_json_schema['properties'][column_name]['format'] = column_format
151
152 prior_header = column_name
153 i = i + 1
154
155 return sheet_json_schema, columns
156
157
158def get_sheet_metadata(sheet, spreadsheet_id, client):
159 sheet_id = sheet.get('properties', {}).get('sheetId')
160 sheet_title = sheet.get('properties', {}).get('title')
161 LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))
162
163 stream_name = 'sheet_metadata'
164 stream_metadata = STREAMS.get(stream_name)
165 api = stream_metadata.get('api', 'sheets')
166 params = stream_metadata.get('params', {})
99424fee
JH
167 querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \
168 params.items()]).replace('{sheet_title}', sheet_title)
169 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
170 spreadsheet_id), querystring)
89643ba6
JH
171
172 sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
173 sheet_cols = sheet_md_results.get('sheets')[0]
99424fee 174 sheet_schema, columns = get_sheet_schema_columns(sheet_cols)
89643ba6
JH
175
176 return sheet_schema, columns
177
178
179def get_abs_path(path):
180 return os.path.join(os.path.dirname(os.path.realpath(__file__)), path)
181
182def get_schemas(client, spreadsheet_id):
183 schemas = {}
184 field_metadata = {}
185
186 for stream_name, stream_metadata in STREAMS.items():
187 schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
188 with open(schema_path) as file:
189 schema = json.load(file)
190 schemas[stream_name] = schema
191 mdata = metadata.new()
192
193 # Documentation:
194 # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
195 # Reference:
196 # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
197 mdata = metadata.get_standard_metadata(
198 schema=schema,
199 key_properties=stream_metadata.get('key_properties', None),
200 valid_replication_keys=stream_metadata.get('replication_keys', None),
201 replication_method=stream_metadata.get('replication_method', None)
202 )
203 field_metadata[stream_name] = mdata
99424fee 204
89643ba6
JH
205 if stream_name == 'spreadsheet_metadata':
206 api = stream_metadata.get('api', 'sheets')
207 params = stream_metadata.get('params', {})
208 querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
99424fee
JH
209 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
210 spreadsheet_id), querystring)
89643ba6 211
99424fee
JH
212 spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
213 endpoint=stream_name)
89643ba6
JH
214
215 sheets = spreadsheet_md_results.get('sheets')
216 if sheets:
217 for sheet in sheets:
218 sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
99424fee 219 LOGGER.info('columns = {}'.format(columns))
89643ba6
JH
220
221 sheet_title = sheet.get('properties', {}).get('title')
222 schemas[sheet_title] = sheet_schema
223 sheet_mdata = metadata.new()
224 sheet_mdata = metadata.get_standard_metadata(
225 schema=sheet_schema,
226 key_properties=['__sdc_row'],
227 valid_replication_keys=None,
228 replication_method='FULL_TABLE'
229 )
230 field_metadata[sheet_title] = sheet_mdata
99424fee 231
89643ba6 232 return schemas, field_metadata