diff options
-rw-r--r-- | CHANGELOG.md | 3 | ||||
-rw-r--r-- | setup.py | 2 | ||||
-rw-r--r-- | tap_google_sheets/schema.py | 18 | ||||
-rw-r--r-- | tap_google_sheets/sync.py | 24 |
4 files changed, 34 insertions, 13 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index fb38c2c..08c4751 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md | |||
@@ -1,5 +1,8 @@ | |||
1 | # Changelog | 1 | # Changelog |
2 | 2 | ||
3 | ## 1.0.3 | ||
4 | * Fix issues: slashes `/` in sheet name 404 error; Discovery malformed sheet error when 2nd row final column value(s) are `NULL`. | ||
5 | |||
3 | ## 1.0.2 | 6 | ## 1.0.2 |
4 | * Skip sheets for which we fail to generate a schema | 7 | * Skip sheets for which we fail to generate a schema |
5 | 8 | ||
@@ -3,7 +3,7 @@ | |||
3 | from setuptools import setup, find_packages | 3 | from setuptools import setup, find_packages |
4 | 4 | ||
5 | setup(name='tap-google-sheets', | 5 | setup(name='tap-google-sheets', |
6 | version='1.0.2', | 6 | version='1.0.3', |
7 | description='Singer.io tap for extracting data from the Google Sheets v4 API', | 7 | description='Singer.io tap for extracting data from the Google Sheets v4 API', |
8 | author='jeff.huth@bytecode.io', | 8 | author='jeff.huth@bytecode.io', |
9 | classifiers=['Programming Language :: Python :: 3 :: Only'], | 9 | classifiers=['Programming Language :: Python :: 3 :: Only'], |
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index 3e63e75..fcaccf9 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py | |||
@@ -1,5 +1,7 @@ | |||
1 | import os | 1 | import os |
2 | import json | 2 | import json |
3 | import re | ||
4 | import urllib.parse | ||
3 | from collections import OrderedDict | 5 | from collections import OrderedDict |
4 | import singer | 6 | import singer |
5 | from singer import metadata | 7 | from singer import metadata |
@@ -74,8 +76,11 @@ def get_sheet_schema_columns(sheet): | |||
74 | try: | 76 | try: |
75 | first_value = first_values[i] | 77 | first_value = first_values[i] |
76 | except IndexError as err: | 78 | except IndexError as err: |
77 | raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format( | 79 | LOGGER.info('NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2. {}'.format( |
78 | sheet_title, column_name, column_letter, err)) | 80 | sheet_title, column_name, column_letter, err)) |
81 | first_value = {} | ||
82 | first_values.append(first_value) | ||
83 | pass | ||
79 | 84 | ||
80 | column_effective_value = first_value.get('effectiveValue', {}) | 85 | column_effective_value = first_value.get('effectiveValue', {}) |
81 | 86 | ||
@@ -221,20 +226,23 @@ def get_sheet_metadata(sheet, spreadsheet_id, client): | |||
221 | stream_metadata = STREAMS.get(stream_name) | 226 | stream_metadata = STREAMS.get(stream_name) |
222 | api = stream_metadata.get('api', 'sheets') | 227 | api = stream_metadata.get('api', 'sheets') |
223 | params = stream_metadata.get('params', {}) | 228 | params = stream_metadata.get('params', {}) |
229 | sheet_title_encoded = urllib.parse.quote_plus(sheet_title) | ||
230 | sheet_title_escaped = re.escape(sheet_title) | ||
224 | querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \ | 231 | querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \ |
225 | params.items()]).replace('{sheet_title}', sheet_title) | 232 | params.items()]).replace('{sheet_title}', sheet_title_encoded) |
226 | path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ | 233 | path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ |
227 | spreadsheet_id), querystring) | 234 | spreadsheet_id), querystring) |
228 | 235 | ||
229 | sheet_md_results = client.get(path=path, api=api, endpoint=stream_name) | 236 | sheet_md_results = client.get(path=path, api=api, endpoint=sheet_title_escaped) |
230 | # sheet_metadata: 1st `sheets` node in results | 237 | # sheet_metadata: 1st `sheets` node in results |
231 | sheet_metadata = sheet_md_results.get('sheets')[0] | 238 | sheet_metadata = sheet_md_results.get('sheets')[0] |
232 | 239 | ||
233 | # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results) | 240 | # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results) |
234 | try: | 241 | try: |
235 | sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata) | 242 | sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata) |
236 | except: | 243 | except Exception as err: |
237 | LOGGER.info('SKIPPING Malformed sheet: {}'.format(sheet_title)) | 244 | LOGGER.warning('{}'.format(err)) |
245 | LOGGER.warning('SKIPPING Malformed sheet: {}'.format(sheet_title)) | ||
238 | sheet_json_schema, columns = None, None | 246 | sheet_json_schema, columns = None, None |
239 | 247 | ||
240 | return sheet_json_schema, columns | 248 | return sheet_json_schema, columns |
diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py index 689be27..3e26bc4 100644 --- a/tap_google_sheets/sync.py +++ b/tap_google_sheets/sync.py | |||
@@ -1,6 +1,8 @@ | |||
1 | import time | 1 | import time |
2 | import math | 2 | import math |
3 | import json | 3 | import json |
4 | import re | ||
5 | import urllib.parse | ||
4 | from datetime import datetime, timedelta | 6 | from datetime import datetime, timedelta |
5 | import pytz | 7 | import pytz |
6 | import singer | 8 | import singer |
@@ -75,10 +77,14 @@ def process_records(catalog, | |||
75 | for record in records: | 77 | for record in records: |
76 | # Transform record for Singer.io | 78 | # Transform record for Singer.io |
77 | with Transformer() as transformer: | 79 | with Transformer() as transformer: |
78 | transformed_record = transformer.transform( | 80 | try: |
79 | record, | 81 | transformed_record = transformer.transform( |
80 | schema, | 82 | record, |
81 | stream_metadata) | 83 | schema, |
84 | stream_metadata) | ||
85 | except Exception as err: | ||
86 | LOGGER.error('{}'.format(err)) | ||
87 | raise RuntimeError(err) | ||
82 | write_record( | 88 | write_record( |
83 | stream_name=stream_name, | 89 | stream_name=stream_name, |
84 | record=transformed_record, | 90 | record=transformed_record, |
@@ -144,22 +150,26 @@ def get_data(stream_name, | |||
144 | if not range_rows: | 150 | if not range_rows: |
145 | range_rows = '' | 151 | range_rows = '' |
146 | # Replace {placeholder} variables in path | 152 | # Replace {placeholder} variables in path |
153 | # Encode stream_name: fixes issue w/ special characters in sheet name | ||
154 | stream_name_escaped = re.escape(stream_name) | ||
155 | stream_name_encoded = urllib.parse.quote_plus(stream_name) | ||
147 | path = endpoint_config.get('path', stream_name).replace( | 156 | path = endpoint_config.get('path', stream_name).replace( |
148 | '{spreadsheet_id}', spreadsheet_id).replace('{sheet_title}', stream_name).replace( | 157 | '{spreadsheet_id}', spreadsheet_id).replace('{sheet_title}', stream_name_encoded).replace( |
149 | '{range_rows}', range_rows) | 158 | '{range_rows}', range_rows) |
150 | params = endpoint_config.get('params', {}) | 159 | params = endpoint_config.get('params', {}) |
151 | api = endpoint_config.get('api', 'sheets') | 160 | api = endpoint_config.get('api', 'sheets') |
152 | # Add in querystring parameters and replace {placeholder} variables | 161 | # Add in querystring parameters and replace {placeholder} variables |
153 | # querystring function ensures parameters are added but not encoded causing API errors | 162 | # querystring function ensures parameters are added but not encoded causing API errors |
154 | querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]).replace( | 163 | querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]).replace( |
155 | '{sheet_title}', stream_name) | 164 | '{sheet_title}', stream_name_encoded) |
165 | LOGGER.info('URL: {}/{}?{}'.format(client.base_url, path, querystring)) | ||
156 | data = {} | 166 | data = {} |
157 | time_extracted = utils.now() | 167 | time_extracted = utils.now() |
158 | data = client.get( | 168 | data = client.get( |
159 | path=path, | 169 | path=path, |
160 | api=api, | 170 | api=api, |
161 | params=querystring, | 171 | params=querystring, |
162 | endpoint=stream_name) | 172 | endpoint=stream_name_escaped) |
163 | return data, time_extracted | 173 | return data, time_extracted |
164 | 174 | ||
165 | 175 | ||