import os
import json
from collections import OrderedDict

import singer
from singer import metadata

from tap_google_sheets.streams import STREAMS
# Module-level singer logger shared by every function in this file.
LOGGER = singer.get_logger()
# Reference: https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata
# Convert a 1-based column index to a spreadsheet column letter:
# 1 -> 'A', 26 -> 'Z', 27 -> 'AA', 703 -> 'AAA'.
def colnum_string(num):
    string = ''
    while num > 0:
        num, remainder = divmod(num - 1, 26)
        string = chr(65 + remainder) + string
    return string


# Create sheet_json_schema with columns from sheet
def get_sheet_schema_columns(sheet):
    """Build a JSON Schema and column metadata for one worksheet.

    `sheet` is a Google Sheets API `sheets` node whose `data` holds the first
    two rows of the worksheet (row 1 = headers, row 2 = first data row, used
    to infer each column's datatype).

    Returns a tuple (sheet_json_schema, columns); (None, None) for an empty
    sheet. Raises Exception on duplicate headers, a missing 2nd-row cell, or
    an error/formula value in the 2nd row.
    """
    sheet_title = sheet.get('properties', {}).get('title')
    data = next(iter(sheet.get('data', [])), {})
    row_data = data.get('rowData', [])
    if row_data == []:
        # Empty sheet, SKIP
        LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
        return None, None
    # spreadsheet is an OrderedDict, with ordered sheets and rows in the response
    headers = row_data[0].get('values', [])
    first_values = row_data[1].get('values', [])
    # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))

    # Base schema: every sheet record carries these singer bookkeeping fields.
    sheet_json_schema = {
        'type': 'object',
        'additionalProperties': False,
        'properties': {
            '__sdc_spreadsheet_id': {
                'type': ['null', 'string']
            },
            '__sdc_sheet_id': {
                'type': ['null', 'integer']
            },
            '__sdc_row': {
                'type': ['null', 'integer']
            }
        }
    }

    header_list = []  # used for checking uniqueness
    columns = []
    prior_header = None
    i = 0
    skipped = 0

    # Read column headers until end or 2 consecutive skipped headers
    for header in headers:
        # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
        column_index = i + 1
        column_letter = colnum_string(column_index)
        header_value = header.get('formattedValue')
        col_val = None  # initialized so later log/raise paths cannot hit a NameError
        if header_value:  # NOT skipped
            column_is_skipped = False
            skipped = 0
            column_name = '{}'.format(header_value)
            if column_name in header_list:
                raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
                    sheet_title, column_name, column_letter))
            header_list.append(column_name)

            first_value = None
            try:
                first_value = first_values[i]
            except IndexError as err:
                raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
                    sheet_title, column_name, column_letter, err))

            column_effective_value = first_value.get('effectiveValue', {})

            if column_effective_value == {}:
                # Blank 2nd-row cell: fall back to string.
                column_effective_value_type = 'stringValue'
                LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
                    sheet_title, column_name, column_letter))
                LOGGER.info('  Setting column datatype to STRING')
            else:
                for key, val in column_effective_value.items():
                    if key in ('numberValue', 'stringValue', 'boolValue'):
                        column_effective_value_type = key
                    elif key in ('errorType', 'formulaType'):
                        col_val = str(val)
                        raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
                            sheet_title, column_name, column_letter, key, col_val))

            column_number_format = first_values[i].get('effectiveFormat', {}).get(
                'numberFormat', {})
            column_number_format_type = column_number_format.get('type')

            # Determine datatype for sheet_json_schema
            #
            # column_effective_value_type = numberValue, stringValue, boolValue;
            #  INVALID: errorType, formulaType
            #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
            #
            # column_number_format_type = UNSPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
            #   TIME, DATE_TIME, SCIENTIFIC
            #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
            #
            if column_effective_value == {}:
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'stringValue'
                LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
                    sheet_title, column_name, column_letter))
                LOGGER.info('  Setting column datatype to STRING')
            elif column_effective_value_type == 'stringValue':
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'stringValue'
            elif column_effective_value_type == 'boolValue':
                col_properties = {'type': ['null', 'boolean', 'string']}
                column_gs_type = 'boolValue'
            elif column_effective_value_type == 'numberValue':
                # Numbers are refined by their display format; date/time formats
                # become date/time-formatted strings in the schema.
                if column_number_format_type == 'DATE_TIME':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'date-time'
                    }
                    column_gs_type = 'numberType.DATE_TIME'
                elif column_number_format_type == 'DATE':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'date'
                    }
                    column_gs_type = 'numberType.DATE'
                elif column_number_format_type == 'TIME':
                    col_properties = {
                        'type': ['null', 'string'],
                        'format': 'time'
                    }
                    column_gs_type = 'numberType.TIME'
                elif column_number_format_type == 'TEXT':
                    col_properties = {'type': ['null', 'string']}
                    column_gs_type = 'stringValue'
                else:
                    # Interesting - order in the anyOf makes a difference.
                    # Number w/ multipleOf must be listed last, otherwise errors occur.
                    col_properties = {
                        'anyOf': [
                            {
                                'type': 'null'
                            },
                            {
                                'type': 'string'
                            },
                            {
                                'type': 'number',
                                'multipleOf': 1e-15
                            }
                        ]
                    }
                    column_gs_type = 'numberType'
            else:
                # Catch-all to deal with other types and set to string
                # column_effective_value_type: formulaValue, errorValue, or other
                col_properties = {'type': ['null', 'string']}
                column_gs_type = 'unsupportedValue'
                LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
                    sheet_title, column_name, column_letter, column_effective_value_type, col_val))
                LOGGER.info('Converting to string.')
        else:  # header is blank -> skipped column, placeholder name
            column_is_skipped = True
            skipped = skipped + 1
            column_index_str = str(column_index).zfill(2)
            column_name = '__sdc_skip_col_{}'.format(column_index_str)
            col_properties = {'type': ['null', 'string']}
            column_gs_type = 'stringValue'
            LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
                sheet_title, column_name, column_letter))
            LOGGER.info('  This column will be skipped during data loading.')

        if skipped >= 2:
            # skipped = 2 consecutive skipped headers
            # Remove prior_header column_name and stop scanning for columns.
            sheet_json_schema['properties'].pop(prior_header, None)
            LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
                sheet_title, column_name, column_letter))
            break

        column = {
            'columnIndex': column_index,
            'columnLetter': column_letter,
            'columnName': column_name,
            'columnType': column_gs_type,
            'columnSkipped': column_is_skipped
        }
        columns.append(column)

        sheet_json_schema['properties'][column_name] = col_properties

        prior_header = column_name
        i = i + 1

    return sheet_json_schema, columns
# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
#   endpoint: spreadsheets/{spreadsheet_id}
#   params: includeGridData = true, ranges = '{sheet_title}'!1:2
# This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc.
def get_sheet_metadata(sheet, spreadsheet_id, client):
    """Fetch rows 1-2 of a worksheet and derive its schema and column metadata.

    Returns (sheet_json_schema, columns); (None, None) when the sheet is
    malformed (schema generation raised) so callers can skip it.
    """
    sheet_id = sheet.get('properties', {}).get('sheetId')
    sheet_title = sheet.get('properties', {}).get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    stream_name = 'sheet_metadata'
    stream_metadata = STREAMS.get(stream_name)
    api = stream_metadata.get('api', 'sheets')
    params = stream_metadata.get('params', {})
    # Build the querystring from the stream's params, substituting the sheet title.
    querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in
                            params.items()]).replace('{sheet_title}', sheet_title)
    path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}',
                                                              spreadsheet_id), querystring)

    sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
    # sheet_metadata: 1st `sheets` node in results
    sheet_metadata = sheet_md_results.get('sheets')[0]

    # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
    try:
        sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
    except Exception as err:
        # Schema generation failed (duplicate headers, bad 2nd-row value, etc.);
        # skip this sheet rather than aborting discovery for the whole spreadsheet.
        LOGGER.info('{}'.format(err))
        LOGGER.info('SKIPPING Malformed sheet: {}'.format(sheet_title))
        sheet_json_schema, columns = None, None

    return sheet_json_schema, columns
def get_abs_path(path):
    """Return `path` resolved relative to the directory containing this module."""
    return os.path.join(os.path.dirname(os.path.realpath(__file__)), path)
def get_schemas(client, spreadsheet_id):
    """Build discovery schemas and singer metadata for all streams.

    Loads the static JSON Schema file for each stream in STREAMS, then (for
    the `spreadsheet_metadata` stream) queries the spreadsheet and adds one
    dynamic schema per non-empty worksheet.

    Returns (schemas, field_metadata): both keyed by stream name / sheet title.
    """
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as schema_file:
            schema = json.load(schema_file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None)
        )
        field_metadata[stream_name] = mdata

        if stream_name == 'spreadsheet_metadata':
            api = stream_metadata.get('api', 'sheets')
            params = stream_metadata.get('params', {})
            querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
            path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}',
                                                                      spreadsheet_id), querystring)

            # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
            spreadsheet_md_results = client.get(path=path, params=querystring, api=api,
                                                endpoint=stream_name)

            sheets = spreadsheet_md_results.get('sheets')
            if sheets:
                # Loop thru each worksheet in spreadsheet
                for sheet in sheets:
                    # GET sheet_json_schema for each worksheet (from function above)
                    sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)

                    # SKIP empty sheets (where sheet_json_schema and columns are None)
                    if sheet_json_schema and columns:
                        sheet_title = sheet.get('properties', {}).get('title')
                        schemas[sheet_title] = sheet_json_schema
                        sheet_mdata = metadata.new()
                        sheet_mdata = metadata.get_standard_metadata(
                            schema=sheet_json_schema,
                            key_properties=['__sdc_row'],
                            valid_replication_keys=None,
                            replication_method='FULL_TABLE'
                        )
                        field_metadata[sheet_title] = sheet_mdata

    return schemas, field_metadata