From 5fc2ead55ea6a67c13e3caeec315454148be593d Mon Sep 17 00:00:00 2001 From: Jeff Huth <39202799+jeffhuth-bytecode@users.noreply.github.com> Date: Wed, 29 Apr 2020 11:29:42 -0700 Subject: v.1.0.3 Fix slashes and discovery errors (#15) Fix issues: slashes `/` in sheet name 404 error; Discovery malformed sheet error when 2nd row final column value(s) are `NULL`. --- CHANGELOG.md | 3 +++ setup.py | 2 +- tap_google_sheets/schema.py | 18 +++++++++++++----- tap_google_sheets/sync.py | 24 +++++++++++++++++------- 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fb38c2c..08c4751 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## 1.0.3 + * Fix issues: slashes `/` in sheet name 404 error; Discovery malformed sheet error when 2nd row final column value(s) are `NULL`. + ## 1.0.2 * Skip sheets for which we fail to generate a schema diff --git a/setup.py b/setup.py index 49c3e3e..ec63fdf 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-google-sheets', - version='1.0.2', + version='1.0.3', description='Singer.io tap for extracting data from the Google Sheets v4 API', author='jeff.huth@bytecode.io', classifiers=['Programming Language :: Python :: 3 :: Only'], diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index 3e63e75..fcaccf9 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py @@ -1,5 +1,7 @@ import os import json +import re +import urllib.parse from collections import OrderedDict import singer from singer import metadata @@ -74,8 +76,11 @@ def get_sheet_schema_columns(sheet): try: first_value = first_values[i] except IndexError as err: - raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format( + LOGGER.info('NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2. {}'.format( sheet_title, column_name, column_letter, err)) + first_value = {} + first_values.append(first_value) + pass column_effective_value = first_value.get('effectiveValue', {}) @@ -221,20 +226,23 @@ def get_sheet_metadata(sheet, spreadsheet_id, client): stream_metadata = STREAMS.get(stream_name) api = stream_metadata.get('api', 'sheets') params = stream_metadata.get('params', {}) + sheet_title_encoded = urllib.parse.quote_plus(sheet_title) + sheet_title_escaped = re.escape(sheet_title) querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \ - params.items()]).replace('{sheet_title}', sheet_title) + params.items()]).replace('{sheet_title}', sheet_title_encoded) path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ spreadsheet_id), querystring) - sheet_md_results = client.get(path=path, api=api, endpoint=stream_name) + sheet_md_results = client.get(path=path, api=api, endpoint=sheet_title_escaped) # sheet_metadata: 1st `sheets` node in results sheet_metadata = sheet_md_results.get('sheets')[0] # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results) try: sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata) - except: - LOGGER.info('SKIPPING Malformed sheet: {}'.format(sheet_title)) + except Exception as err: + LOGGER.warning('{}'.format(err)) + LOGGER.warning('SKIPPING Malformed sheet: {}'.format(sheet_title)) sheet_json_schema, columns = None, None return sheet_json_schema, columns diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py index 689be27..3e26bc4 100644 --- a/tap_google_sheets/sync.py +++ b/tap_google_sheets/sync.py @@ -1,6 +1,8 @@ import time import math import json +import re +import urllib.parse from datetime import datetime, timedelta import pytz import singer @@ -75,10 +77,14 @@ def process_records(catalog, for record in records: # Transform record for Singer.io with Transformer() as transformer: - transformed_record = transformer.transform( - record, - schema, - stream_metadata) + try: + transformed_record = transformer.transform( + record, + schema, + stream_metadata) + except Exception as err: + LOGGER.error('{}'.format(err)) + raise RuntimeError(err) write_record( stream_name=stream_name, record=transformed_record, @@ -144,22 +150,26 @@ def get_data(stream_name, if not range_rows: range_rows = '' # Replace {placeholder} variables in path + # Encode stream_name: fixes issue w/ special characters in sheet name + stream_name_escaped = re.escape(stream_name) + stream_name_encoded = urllib.parse.quote_plus(stream_name) path = endpoint_config.get('path', stream_name).replace( - '{spreadsheet_id}', spreadsheet_id).replace('{sheet_title}', stream_name).replace( + '{spreadsheet_id}', spreadsheet_id).replace('{sheet_title}', stream_name_encoded).replace( '{range_rows}', range_rows) params = endpoint_config.get('params', {}) api = endpoint_config.get('api', 'sheets') # Add in querystring parameters and replace {placeholder} variables # querystring function ensures parameters are added but not encoded causing API errors querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]).replace( - '{sheet_title}', stream_name) + '{sheet_title}', stream_name_encoded) + LOGGER.info('URL: {}/{}?{}'.format(client.base_url, path, querystring)) data = {} time_extracted = utils.now() data = client.get( path=path, api=api, params=querystring, - endpoint=stream_name) + endpoint=stream_name_escaped) return data, time_extracted -- cgit v1.2.3