aboutsummaryrefslogtreecommitdiffhomepage
path: root/tap_google_sheets/schema.py
diff options
context:
space:
mode:
Diffstat (limited to 'tap_google_sheets/schema.py')
-rw-r--r--tap_google_sheets/schema.py365
1 files changed, 185 insertions, 180 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py
index e319c03..c229d72 100644
--- a/tap_google_sheets/schema.py
+++ b/tap_google_sheets/schema.py
@@ -25,184 +25,188 @@ def get_sheet_schema_columns(sheet):
25 sheet_json_schema = OrderedDict() 25 sheet_json_schema = OrderedDict()
26 data = next(iter(sheet.get('data', [])), {}) 26 data = next(iter(sheet.get('data', [])), {})
27 row_data = data.get('rowData', []) 27 row_data = data.get('rowData', [])
28 # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse 28 if row_data == []:
29 29 # Empty sheet, SKIP
30 headers = row_data[0].get('values', []) 30 LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
31 first_values = row_data[1].get('values', []) 31 return None, None
32 # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True))) 32 else:
33 33 # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
34 sheet_json_schema = { 34 headers = row_data[0].get('values', [])
35 'type': 'object', 35 first_values = row_data[1].get('values', [])
36 'additionalProperties': False, 36 # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
37 'properties': { 37
38 '__sdc_spreadsheet_id': { 38 sheet_json_schema = {
39 'type': ['null', 'string'] 39 'type': 'object',
40 }, 40 'additionalProperties': False,
41 '__sdc_sheet_id': { 41 'properties': {
42 'type': ['null', 'integer'] 42 '__sdc_spreadsheet_id': {
43 }, 43 'type': ['null', 'string']
44 '__sdc_row': { 44 },
45 'type': ['null', 'integer'] 45 '__sdc_sheet_id': {
46 'type': ['null', 'integer']
47 },
48 '__sdc_row': {
49 'type': ['null', 'integer']
50 }
46 } 51 }
47 } 52 }
48 } 53
49 54 header_list = [] # used for checking uniqueness
50 header_list = [] # used for checking uniqueness 55 columns = []
51 columns = [] 56 prior_header = None
52 prior_header = None 57 i = 0
53 i = 0 58 skipped = 0
54 skipped = 0 59 # Read column headers until end or 2 consecutive skipped headers
55 # Read column headers until end or 2 consecutive skipped headers 60 for header in headers:
56 for header in headers: 61 # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
57 # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True))) 62 column_index = i + 1
58 column_index = i + 1 63 column_letter = colnum_string(column_index)
59 column_letter = colnum_string(column_index) 64 header_value = header.get('formattedValue')
60 header_value = header.get('formattedValue') 65 if header_value: # NOT skipped
61 if header_value: # NOT skipped 66 column_is_skipped = False
62 column_is_skipped = False 67 skipped = 0
63 skipped = 0 68 column_name = '{}'.format(header_value)
64 column_name = '{}'.format(header_value) 69 if column_name in header_list:
65 if column_name in header_list: 70 raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
66 raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
67 sheet_title, column_name, column_letter))
68 header_list.append(column_name)
69
70 first_value = None
71 try:
72 first_value = first_values[i]
73 except IndexError as err:
74 raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
75 sheet_title, column_name, column_letter, err))
76
77 column_effective_value = first_value.get('effectiveValue', {})
78
79 col_val = None
80 if column_effective_value == {}:
81 column_effective_value_type = 'stringValue'
82 LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
83 sheet_title, column_name, column_letter))
84 LOGGER.info(' Setting column datatype to STRING')
85 else:
86 for key, val in column_effective_value.items():
87 if key in ('numberValue', 'stringValue', 'boolValue'):
88 column_effective_value_type = key
89 col_val = str(val)
90 elif key in ('errorType', 'formulaType'):
91 col_val = str(val)
92 raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
93 sheet_title, column_name, column_letter, key, col_val))
94
95 column_number_format = first_values[i].get('effectiveFormat', {}).get(
96 'numberFormat', {})
97 column_number_format_type = column_number_format.get('type')
98
99 # Determine datatype for sheet_json_schema
100 #
101 # column_effective_value_type = numberValue, stringValue, boolValue;
102 # INVALID: errorType, formulaType
103 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
104 #
105 # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
106 # TIME, DATE_TIME, SCIENTIFIC
107 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
108 #
109 column_format = None # Default
110 if column_effective_value == {}:
111 col_properties = {'type': ['null', 'string']}
112 column_gs_type = 'stringValue'
113 LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
114 sheet_title, column_name, column_letter)) 71 sheet_title, column_name, column_letter))
115 LOGGER.info(' Setting column datatype to STRING') 72 header_list.append(column_name)
116 elif column_effective_value_type == 'stringValue': 73
117 col_properties = {'type': ['null', 'string']} 74 first_value = None
118 column_gs_type = 'stringValue' 75 try:
119 elif column_effective_value_type == 'boolValue': 76 first_value = first_values[i]
120 col_properties = {'type': ['null', 'boolean', 'string']} 77 except IndexError as err:
121 column_gs_type = 'boolValue' 78 raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
122 elif column_effective_value_type == 'numberValue': 79 sheet_title, column_name, column_letter, err))
123 if column_number_format_type == 'DATE_TIME': 80
124 col_properties = { 81 column_effective_value = first_value.get('effectiveValue', {})
125 'type': ['null', 'string'], 82
126 'format': 'date-time' 83 col_val = None
127 } 84 if column_effective_value == {}:
128 column_gs_type = 'numberType.DATE_TIME' 85 column_effective_value_type = 'stringValue'
129 elif column_number_format_type == 'DATE': 86 LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
130 col_properties = { 87 sheet_title, column_name, column_letter))
131 'type': ['null', 'string'], 88 LOGGER.info(' Setting column datatype to STRING')
132 'format': 'date' 89 else:
133 } 90 for key, val in column_effective_value.items():
134 column_gs_type = 'numberType.DATE' 91 if key in ('numberValue', 'stringValue', 'boolValue'):
135 elif column_number_format_type == 'TIME': 92 column_effective_value_type = key
136 col_properties = { 93 col_val = str(val)
137 'type': ['null', 'string'], 94 elif key in ('errorType', 'formulaType'):
138 'format': 'time' 95 col_val = str(val)
139 } 96 raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
140 column_gs_type = 'numberType.TIME' 97 sheet_title, column_name, column_letter, key, col_val))
141 elif column_number_format_type == 'TEXT': 98
99 column_number_format = first_values[i].get('effectiveFormat', {}).get(
100 'numberFormat', {})
101 column_number_format_type = column_number_format.get('type')
102
103 # Determine datatype for sheet_json_schema
104 #
105 # column_effective_value_type = numberValue, stringValue, boolValue;
106 # INVALID: errorType, formulaType
107 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
108 #
109 # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
110 # TIME, DATE_TIME, SCIENTIFIC
111 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
112 #
113 column_format = None # Default
114 if column_effective_value == {}:
115 col_properties = {'type': ['null', 'string']}
116 column_gs_type = 'stringValue'
117 LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
118 sheet_title, column_name, column_letter))
119 LOGGER.info(' Setting column datatype to STRING')
120 elif column_effective_value_type == 'stringValue':
142 col_properties = {'type': ['null', 'string']} 121 col_properties = {'type': ['null', 'string']}
143 column_gs_type = 'stringValue' 122 column_gs_type = 'stringValue'
123 elif column_effective_value_type == 'boolValue':
124 col_properties = {'type': ['null', 'boolean', 'string']}
125 column_gs_type = 'boolValue'
126 elif column_effective_value_type == 'numberValue':
127 if column_number_format_type == 'DATE_TIME':
128 col_properties = {
129 'type': ['null', 'string'],
130 'format': 'date-time'
131 }
132 column_gs_type = 'numberType.DATE_TIME'
133 elif column_number_format_type == 'DATE':
134 col_properties = {
135 'type': ['null', 'string'],
136 'format': 'date'
137 }
138 column_gs_type = 'numberType.DATE'
139 elif column_number_format_type == 'TIME':
140 col_properties = {
141 'type': ['null', 'string'],
142 'format': 'time'
143 }
144 column_gs_type = 'numberType.TIME'
145 elif column_number_format_type == 'TEXT':
146 col_properties = {'type': ['null', 'string']}
147 column_gs_type = 'stringValue'
148 else:
149 # Interesting - order in the anyOf makes a difference.
150 # Number w/ multipleOf must be listed last, otherwise errors occur.
151 col_properties = {
152 'anyOf': [
153 {
154 'type': 'null'
155 },
156 {
157 'type': 'number',
158 'multipleOf': 1e-15
159 },
160 {
161 'type': 'string'
162 }
163 ]
164 }
165 column_gs_type = 'numberType'
166 # Catch-all to deal with other types and set to string
167 # column_effective_value_type: formulaValue, errorValue, or other
144 else: 168 else:
145 # Interesting - order in the anyOf makes a difference. 169 col_properties = {'type': ['null', 'string']}
146 # Number w/ multipleOf must be listed last, otherwise errors occur. 170 column_gs_type = 'unsupportedValue'
147 col_properties = { 171 LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
148 'anyOf': [ 172 sheet_title, column_name, column_letter, column_effective_value_type, col_val))
149 { 173 LOGGER.info('Converting to string.')
150 'type': 'string' 174 else: # skipped
151 }, 175 column_is_skipped = True
152 { 176 skipped = skipped + 1
153 'type': 'null' 177 column_index_str = str(column_index).zfill(2)
154 }, 178 column_name = '__sdc_skip_col_{}'.format(column_index_str)
155 {
156 'type': 'number',
157 'multipleOf': 1e-15
158 }
159 ]
160 }
161 column_gs_type = 'numberType'
162 # Catch-all to deal with other types and set to string
163 # column_effective_value_type: formulaValue, errorValue, or other
164 else:
165 col_properties = {'type': ['null', 'string']} 179 col_properties = {'type': ['null', 'string']}
166 column_gs_type = 'unsupportedValue' 180 column_gs_type = 'stringValue'
167 LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format( 181 LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
168 sheet_title, column_name, column_letter, column_effective_value_type, col_val)) 182 sheet_title, column_name, column_letter))
169 LOGGER.info('Converting to string.') 183 LOGGER.info(' This column will be skipped during data loading.')
170 else: # skipped
171 column_is_skipped = True
172 skipped = skipped + 1
173 column_index_str = str(column_index).zfill(2)
174 column_name = '__sdc_skip_col_{}'.format(column_index_str)
175 col_properties = {'type': ['null', 'string']}
176 column_gs_type = 'stringValue'
177 LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
178 sheet_title, column_name, column_letter))
179 LOGGER.info(' This column will be skipped during data loading.')
180
181 if skipped >= 2:
182 # skipped = 2 consecutive skipped headers
183 # Remove prior_header column_name
184 sheet_json_schema['properties'].pop(prior_header, None)
185 LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
186 sheet_title, column_name, column_letter))
187 break
188
189 else:
190 column = {}
191 column = {
192 'columnIndex': column_index,
193 'columnLetter': column_letter,
194 'columnName': column_name,
195 'columnType': column_gs_type,
196 'columnSkipped': column_is_skipped
197 }
198 columns.append(column)
199 184
200 sheet_json_schema['properties'][column_name] = col_properties 185 if skipped >= 2:
186 # skipped = 2 consecutive skipped headers
187 # Remove prior_header column_name
188 sheet_json_schema['properties'].pop(prior_header, None)
189 LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
190 sheet_title, column_name, column_letter))
191 break
201 192
202 prior_header = column_name 193 else:
203 i = i + 1 194 column = {}
195 column = {
196 'columnIndex': column_index,
197 'columnLetter': column_letter,
198 'columnName': column_name,
199 'columnType': column_gs_type,
200 'columnSkipped': column_is_skipped
201 }
202 columns.append(column)
204 203
205 return sheet_json_schema, columns 204 sheet_json_schema['properties'][column_name] = col_properties
205
206 prior_header = column_name
207 i = i + 1
208
209 return sheet_json_schema, columns
206 210
207 211
208# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query 212# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
@@ -276,17 +280,18 @@ def get_schemas(client, spreadsheet_id):
276 for sheet in sheets: 280 for sheet in sheets:
277 # GET sheet_json_schema for each worksheet (from function above) 281 # GET sheet_json_schema for each worksheet (from function above)
278 sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client) 282 sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
279 # LOGGER.info('columns = {}'.format(columns)) 283
280 284 # SKIP empty sheets (where sheet_json_schema and columns are None)
281 sheet_title = sheet.get('properties', {}).get('title') 285 if sheet_json_schema and columns:
282 schemas[sheet_title] = sheet_json_schema 286 sheet_title = sheet.get('properties', {}).get('title')
283 sheet_mdata = metadata.new() 287 schemas[sheet_title] = sheet_json_schema
284 sheet_mdata = metadata.get_standard_metadata( 288 sheet_mdata = metadata.new()
285 schema=sheet_json_schema, 289 sheet_mdata = metadata.get_standard_metadata(
286 key_properties=['__sdc_row'], 290 schema=sheet_json_schema,
287 valid_replication_keys=None, 291 key_properties=['__sdc_row'],
288 replication_method='FULL_TABLE' 292 valid_replication_keys=None,
289 ) 293 replication_method='FULL_TABLE'
290 field_metadata[sheet_title] = sheet_mdata 294 )
295 field_metadata[sheet_title] = sheet_mdata
291 296
292 return schemas, field_metadata 297 return schemas, field_metadata