import os
import re
import json
import urllib.parse
from collections import OrderedDict

import singer
from singer import metadata

from tap_google_sheets.streams import STREAMS

LOGGER = singer.get_logger()
13 # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata
# Convert column index to column letter
def colnum_string(num):
    """Convert a 1-based column index to its spreadsheet letter label.

    1 -> 'A', 26 -> 'Z', 27 -> 'AA', 703 -> 'AAA'. Returns '' for num <= 0.
    """
    # Fix: the original fragment had no accumulator, loop, or return —
    # repeated divmod base-26 conversion, prepending each letter.
    string = ""
    while num > 0:
        num, remainder = divmod(num - 1, 26)
        string = chr(65 + remainder) + string
    return string
# Create sheet_metadata_json with columns from sheet
def get_sheet_schema_columns(sheet):
    """Build a JSON schema and column-metadata list for one worksheet.

    Expects `sheet` to be a Sheets API `sheets` node including grid data for
    the first two rows (row 1 = headers, row 2 = first data row, used to
    infer each column's datatype).

    Returns:
        (sheet_json_schema, columns) — an OrderedDict-style JSON schema and a
        list of column descriptor dicts; (None, None) for an empty sheet.

    Raises:
        Exception: on duplicate header names, or when the 2nd-row value is an
        errorType/formulaType cell.
    """
    sheet_title = sheet.get('properties', {}).get('title')
    sheet_json_schema = OrderedDict()
    data = next(iter(sheet.get('data', [])), {})
    row_data = data.get('rowData', [])
    # Need a header row AND at least one data row to infer column datatypes.
    # Fix: the original fragment logged the skip but had no guard/return.
    if row_data == [] or len(row_data) == 1:
        LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
        return None, None

    # spreadsheet is an OrderedDict, with ordered sheets and rows in the response
    headers = row_data[0].get('values', [])
    first_values = row_data[1].get('values', [])

    # Base schema: Singer bookkeeping columns are always present.
    sheet_json_schema = {
        'type': 'object',
        'additionalProperties': False,
        'properties': {
            '__sdc_spreadsheet_id': {
                'type': ['null', 'string']
            },
            '__sdc_sheet_id': {
                'type': ['null', 'integer']
            },
            '__sdc_row': {
                'type': ['null', 'integer']
            }
        }
    }

    header_list = []  # used for checking uniqueness
    columns = []
    prior_header = None
    i = 0
    skipped = 0

    # Read column headers until end or 2 consecutive skipped headers
    for header in headers:
        column_index = i + 1
        column_letter = colnum_string(column_index)
        header_value = header.get('formattedValue')
        if header_value:  # NOT skipped
            column_is_skipped = False
            skipped = 0
            column_name = '{}'.format(header_value)
            if column_name in header_list:
                raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
                    sheet_title, column_name, column_letter))
            header_list.append(column_name)

            # 2nd-row value drives datatype inference; pad first_values when
            # the data row is shorter than the header row.
            first_value = None
            try:
                first_value = first_values[i]
            except IndexError as err:
                LOGGER.info('NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
                    sheet_title, column_name, column_letter, err))
                first_value = {}
                first_values.append(first_value)

            column_effective_value = first_value.get('effectiveValue', {})

            col_val = None
            if column_effective_value == {}:
                column_effective_value_type = 'stringValue'
                LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
                    sheet_title, column_name, column_letter))
                LOGGER.info('   Setting column datatype to STRING')
            else:
                for key, val in column_effective_value.items():
                    if key in ('numberValue', 'stringValue', 'boolValue'):
                        column_effective_value_type = key
                        col_val = str(val)
                    elif key in ('errorType', 'formulaType'):
                        col_val = str(val)
                        raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
                            sheet_title, column_name, column_letter, key, col_val))

            column_number_format = first_values[i].get('effectiveFormat', {}).get(
                'numberFormat', {})
            column_number_format_type = column_number_format.get('type')

            # Determine datatype for sheet_json_schema
            #
            # column_effective_value_type = numberValue, stringValue, boolValue;
            #   INVALID: errorType, formulaType
            #   https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
            #
            # column_number_format_type = UNSPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
            #   TIME, DATE_TIME, SCIENTIFIC
            #   https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
            #
            if column_effective_value == {}:
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'stringValue'
                LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
                    sheet_title, column_name, column_letter))
                LOGGER.info('   Setting column datatype to STRING')
            elif column_effective_value_type == 'stringValue':
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'stringValue'
            elif column_effective_value_type == 'boolValue':
                col_properties = {'type': ['null', 'boolean', 'string']}
                column_gs_type = 'boolValue'
            elif column_effective_value_type == 'numberValue':
                if column_number_format_type == 'DATE_TIME':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'date-time'
                    }
                    column_gs_type = 'numberType.DATE_TIME'
                elif column_number_format_type == 'DATE':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'date'
                    }
                    column_gs_type = 'numberType.DATE'
                elif column_number_format_type == 'TIME':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'time'
                    }
                    column_gs_type = 'numberType.TIME'
                elif column_number_format_type == 'TEXT':
                    col_properties = {'type': ['null', 'string']}
                    column_gs_type = 'stringValue'
                else:
                    # Interesting - order in the anyOf makes a difference.
                    # Number w/ multipleOf must be listed last, otherwise errors occur.
                    col_properties = {'type': 'number', 'multipleOf': 1e-15}
                    column_gs_type = 'numberType'
            else:
                # Catch-all to deal with other types and set to string
                # column_effective_value_type: formulaValue, errorValue, or other
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'unsupportedValue'
                LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
                    sheet_title, column_name, column_letter, column_effective_value_type, col_val))
                LOGGER.info('Converting to string.')
        else:  # header cell is blank -> skipped column with synthetic name
            column_is_skipped = True
            skipped = skipped + 1
            column_index_str = str(column_index).zfill(2)
            column_name = '__sdc_skip_col_{}'.format(column_index_str)
            col_properties = {'type': ['null', 'string']}
            column_gs_type = 'stringValue'
            LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
                sheet_title, column_name, column_letter))
            LOGGER.info('  This column will be skipped during data loading.')

        if skipped >= 2:
            # skipped = 2 consecutive skipped headers
            # Remove prior_header column_name
            sheet_json_schema['properties'].pop(prior_header, None)
            LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
                sheet_title, column_name, column_letter))
            break

        column = {
            'columnIndex': column_index,
            'columnLetter': column_letter,
            'columnName': column_name,
            'columnType': column_gs_type,
            'columnSkipped': column_is_skipped
        }
        columns.append(column)

        # Date/time/number values can arrive as either strings or numbers;
        # wrap in anyOf so both serializations validate.
        if column_gs_type in {'numberType.DATE_TIME', 'numberType.DATE', 'numberType.TIME', 'numberType'}:
            col_properties = {
                'anyOf': [
                    col_properties,
                    {'type': ['null', 'string']}
                ]
            }

        sheet_json_schema['properties'][column_name] = col_properties

        prior_header = column_name
        i = i + 1

    return sheet_json_schema, columns
# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
#   endpoint: spreadsheets/{spreadsheet_id}
#   params: includeGridData = true, ranges = '{sheet_title}'!1:2
# This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc.
def get_sheet_metadata(sheet, spreadsheet_id, client):
    """Fetch rows 1-2 of a worksheet and derive its schema and columns.

    Returns (sheet_json_schema, columns); (None, None) when the sheet is
    empty or malformed (schema derivation raised).
    """
    sheet_id = sheet.get('properties', {}).get('sheetId')
    sheet_title = sheet.get('properties', {}).get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    stream_name = 'sheet_metadata'
    stream_metadata = STREAMS.get(stream_name)
    api = stream_metadata.get('api', 'sheets')
    params = stream_metadata.get('params', {})
    # URL-encode the title for the querystring; regex-escape it for use as
    # the metrics/endpoint label.
    sheet_title_encoded = urllib.parse.quote_plus(sheet_title)
    sheet_title_escaped = re.escape(sheet_title)
    querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \
        params.items()]).replace('{sheet_title}', sheet_title_encoded)
    path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
        spreadsheet_id), querystring)

    sheet_md_results = client.get(path=path, api=api, endpoint=sheet_title_escaped)
    # sheet_metadata: 1st `sheets` node in results
    sheet_metadata = sheet_md_results.get('sheets')[0]

    # Fix: the original fragment had an `except` with no matching `try`.
    try:
        # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
        sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
    except Exception as err:
        LOGGER.warning('{}'.format(err))
        LOGGER.warning('SKIPPING Malformed sheet: {}'.format(sheet_title))
        sheet_json_schema, columns = None, None

    return sheet_json_schema, columns
def get_abs_path(path):
    """Resolve *path* relative to the directory containing this module."""
    module_dir = os.path.dirname(os.path.realpath(__file__))
    return os.path.join(module_dir, path)
def get_schemas(client, spreadsheet_id):
    """Discover schemas and Singer metadata for all streams and worksheets.

    Loads the static JSON schemas for each stream in STREAMS, then (for the
    `spreadsheet_metadata` stream) fetches the spreadsheet's worksheets and
    derives a per-sheet schema from each sheet's first two rows.

    Returns:
        (schemas, field_metadata) — dicts keyed by stream name / sheet title.
    """
    # Fix: the original fragment never initialized these accumulators.
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None)
        )
        field_metadata[stream_name] = mdata

        if stream_name == 'spreadsheet_metadata':
            api = stream_metadata.get('api', 'sheets')
            params = stream_metadata.get('params', {})
            querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
            path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
                spreadsheet_id), querystring)

            # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
            spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
                endpoint=stream_name)

            sheets = spreadsheet_md_results.get('sheets')
            if sheets:
                # Loop thru each worksheet in spreadsheet
                for sheet in sheets:
                    # GET sheet_json_schema for each worksheet (from function above)
                    sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)

                    # SKIP empty sheets (where sheet_json_schema and columns are None)
                    if sheet_json_schema and columns:
                        sheet_title = sheet.get('properties', {}).get('title')
                        schemas[sheet_title] = sheet_json_schema
                        sheet_mdata = metadata.new()
                        sheet_mdata = metadata.get_standard_metadata(
                            schema=sheet_json_schema,
                            key_properties=['__sdc_row'],
                            valid_replication_keys=None,
                            replication_method='FULL_TABLE'
                        )
                        field_metadata[sheet_title] = sheet_mdata

    return schemas, field_metadata