3 from collections
import OrderedDict
5 from singer
import metadata
6 from tap_google_sheets
.streams
import STREAMS
8 LOGGER
= singer
.get_logger()
11 # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata
# Convert column index to column letter
def colnum_string(n):
    """Translate a 1-based column index into its A1-notation letters (1 -> 'A', 27 -> 'AA')."""
    letters = ''
    remaining = n
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, 26)
        letters = chr(65 + offset) + letters
    return letters
# Create sheet_metadata_json with columns from sheet
def get_sheet_schema_columns(sheet, spreadsheet_id, client):
    """Build the JSON schema and column list for one sheet.

    Expects *sheet* to contain the first two rows of sheet data: row 1
    provides the column headers, row 2 provides sample values used to
    infer each column's datatype.

    Returns:
        (sheet_json_schema, columns): the JSON-schema dict for the sheet and
        a list of column-descriptor dicts.
    Raises:
        Exception: on duplicate header names or invalid first-row data types.
    """
    data = next(iter(sheet.get('data', [])), {})
    row_data = data.get('rowData', [])
    # spreadsheet is an OrderedDict, with ordered sheets and rows in the response

    headers = row_data[0].get('values', [])
    first_values = row_data[1].get('values', [])

    # FIX: removed the dead `sheet_json_schema = OrderedDict()` and the
    # ['type'] / ['additionalProperties'] item assignments that were
    # immediately overwritten by this literal.
    sheet_json_schema = {
        'type': 'object',
        'additionalProperties': False,
        'properties': {
            '__sdc_spreadsheet_id': {
                'type': ['null', 'string']
            },
            '__sdc_sheet_id': {
                'type': ['null', 'integer']
            },
            '__sdc_row': {
                'type': ['null', 'integer']
            }
        }
    }

    header_list = []  # used for checking uniqueness
    columns = []
    prior_header = None
    i = 0
    skipped = 0
    # Read column headers until end or 2 consecutive skipped headers
    for header in headers:
        column_index = i + 1
        column_letter = colnum_string(column_index)
        header_value = header.get('formattedValue')
        if header_value:  # NOT skipped
            column_is_skipped = False
            skipped = 0
            column_name = '{}'.format(header_value)
            if column_name in header_list:
                raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name))
            header_list.append(column_name)

            # FIX: guard against a first data row shorter than the header row
            # (previously an IndexError).
            first_value = first_values[i] if i < len(first_values) else {}

            column_effective_value = first_value.get('effectiveValue', {})
            # FIX: reset per iteration so an empty cell can neither leave this
            # unbound nor inherit the previous column's type.
            column_effective_value_type = None
            for key in column_effective_value.keys():
                if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'):
                    column_effective_value_type = key

            column_number_format = first_value.get('effectiveFormat', {}).get(
                'numberFormat', {})
            column_number_format_type = column_number_format.get('type')

            # Determine datatype for sheet_json_schema
            #
            # column_effective_value_type = numberValue, stringValue, boolValue;
            #   INVALID: errorType, formulaType
            #   https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
            # column_number_format_type = UNSPECIFIED, TEXT, NUMBER, PERCENT,
            #   CURRENCY, DATE, TIME, DATE_TIME, SCIENTIFIC
            #   https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
            column_format = None  # Default
            if column_effective_value_type in ('formulaValue', 'errorValue'):
                # NOTE(review): the key filter above only ever records
                # 'errorType'/'formulaType', so this branch looks unreachable
                # as written — confirm intended key names.
                # FIX: the original format string had two placeholders but only
                # one argument, so raising here crashed with IndexError instead
                # of producing this message.
                raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(
                    column_name, column_effective_value))
            elif column_effective_value_type == 'stringValue':
                column_type = ['null', 'string']
                column_gs_type = 'stringValue'
            elif column_effective_value_type == 'boolValue':
                column_type = ['null', 'boolean', 'string']
                column_gs_type = 'boolValue'
            elif column_effective_value_type == 'numberValue':
                # Numbers are refined by their number-format (dates/times are
                # emitted as formatted strings).
                if column_number_format_type == 'DATE_TIME':
                    column_type = ['null', 'string']
                    column_format = 'date-time'
                    column_gs_type = 'numberType.DATE_TIME'
                elif column_number_format_type == 'DATE':
                    column_type = ['null', 'string']
                    column_format = 'date'
                    column_gs_type = 'numberType.DATE'
                elif column_number_format_type == 'TIME':
                    column_type = ['null', 'string']
                    column_format = 'time'
                    column_gs_type = 'numberType.TIME'
                elif column_number_format_type == 'TEXT':
                    column_type = ['null', 'string']
                    column_gs_type = 'stringValue'
                else:
                    column_type = ['null', 'number', 'string']
                    column_gs_type = 'numberType'
            else:
                # FIX: empty or unrecognized first-row cell previously left
                # column_type/column_gs_type unbound (NameError); default to
                # string, matching the skipped-column fallback.
                column_type = ['null', 'string']
                column_gs_type = 'stringValue'
        else:  # header cell is empty: skipped column
            column_is_skipped = True
            skipped = skipped + 1
            column_index_str = str(column_index).zfill(2)
            column_name = '__sdc_skip_col_{}'.format(column_index_str)
            column_type = ['null', 'string']
            column_format = None
            column_gs_type = 'stringValue'

        if skipped >= 2:
            # skipped = 2 consecutive skipped headers: treat as end of data.
            # Remove prior_header column_name from the schema and stop.
            sheet_json_schema['properties'].pop(prior_header, None)
            break

        column = {
            'columnIndex': column_index,
            'columnLetter': column_letter,
            'columnName': column_name,
            'columnType': column_gs_type,
            'columnSkipped': column_is_skipped
        }
        columns.append(column)

        # The column descriptor doubles as the schema entry for this property.
        sheet_json_schema['properties'][column_name] = column
        sheet_json_schema['properties'][column_name]['type'] = column_type
        if column_format:
            sheet_json_schema['properties'][column_name]['format'] = column_format

        prior_header = column_name
        i = i + 1

    return sheet_json_schema, columns
def get_sheet_metadata(sheet, spreadsheet_id, client):
    """Fetch the first two rows of one sheet from the Sheets API and derive
    its JSON schema and column list via get_sheet_schema_columns."""
    sheet_props = sheet.get('properties', {})
    sheet_id = sheet_props.get('sheetId')
    sheet_title = sheet_props.get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    stream_name = 'sheet_metadata'
    stream_metadata = STREAMS.get(stream_name)
    api = stream_metadata.get('api', 'sheets')
    params = stream_metadata.get('params', {})

    # Build the request path from the stream's path/params templates.
    query_parts = ['%s=%s' % (key, value) for (key, value) in params.items()]
    querystring = '&'.join(query_parts).replace('{sheet_title}', sheet_title)
    resolved_path = stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id)
    path = '{}?{}'.format(resolved_path, querystring)

    sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
    sheet_cols = sheet_md_results.get('sheets')[0]
    sheet_schema, columns = get_sheet_schema_columns(sheet_cols, spreadsheet_id, client)

    return sheet_schema, columns
def get_abs_path(path):
    """Resolve *path* relative to the directory containing this module."""
    module_dir = os.path.dirname(os.path.realpath(__file__))
    return os.path.join(module_dir, path)
def get_schemas(client, spreadsheet_id):
    """Run discovery: load static stream schemas from disk and dynamically
    discover one stream per sheet in the spreadsheet.

    Returns:
        (schemas, field_metadata): dicts keyed by stream name / sheet title,
        holding the JSON schema and Singer metadata for each stream.
    """
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        # FIX: removed the dead `mdata = metadata.new()` store that was
        # immediately overwritten by get_standard_metadata.
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None)
        )
        field_metadata[stream_name] = mdata

        if stream_name == 'spreadsheet_metadata':
            api = stream_metadata.get('api', 'sheets')
            params = stream_metadata.get('params', {})
            querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
            path = '{}?{}'.format(stream_metadata.get('path').replace(
                '{spreadsheet_id}', spreadsheet_id), querystring)

            # GET spreadsheet metadata (including the list of sheets)
            spreadsheet_md_results = client.get(path=path, params=querystring, api=api,
                                                endpoint=stream_name)

            sheets = spreadsheet_md_results.get('sheets')
            if sheets:
                for sheet in sheets:
                    # One dynamically discovered stream per sheet; the column
                    # list is not needed at discovery time.
                    sheet_schema, _ = get_sheet_metadata(sheet, spreadsheet_id, client)

                    sheet_title = sheet.get('properties', {}).get('title')
                    schemas[sheet_title] = sheet_schema
                    # FIX: removed the dead `sheet_mdata = metadata.new()`
                    # store (same pattern as above).
                    sheet_mdata = metadata.get_standard_metadata(
                        schema=sheet_schema,
                        key_properties=['__sdc_row'],
                        valid_replication_keys=None,
                        replication_method='FULL_TABLE'
                    )
                    field_metadata[sheet_title] = sheet_mdata

    return schemas, field_metadata