diff options
author | Paul B <paul@bonaud.fr> | 2020-11-19 12:35:22 +0100 |
---|---|---|
committer | Paul B <paul@bonaud.fr> | 2020-11-21 00:32:35 +0100 |
commit | 4bf194076d39d516c3cd0f5c3559954ebe8a12f2 (patch) | |
tree | 422588a014088598ad93884f86224a90ee4333fa | |
parent | 1080d5ece1d90464c448c7e3f8dc58410fad0601 (diff) | |
download | tap-google-sheets-4bf194076d39d516c3cd0f5c3559954ebe8a12f2.tar.gz tap-google-sheets-4bf194076d39d516c3cd0f5c3559954ebe8a12f2.tar.zst tap-google-sheets-4bf194076d39d516c3cd0f5c3559954ebe8a12f2.zip |
feat: use the official Google API python library
These changes will make use of the official `google-api-python-client`
library instead of relying on manual HTTP requests.
Therer are two main advantages of these changes:
- the Tap doesn't need to worry about the Google API interaction
details as its hidden away by the Google official lib.
- We can use the authentication helpers from the lib to ease the
credentials management for the user. In that way the current PR
implements two auth mean: installed OAuth client authentication or
Service Accounts authentication.
The only downside of this change is that it breaks the current
`config.json` parameters for existing users.
-rw-r--r-- | README.md | 34 | ||||
-rw-r--r-- | config.json.example | 7 | ||||
-rwxr-xr-x[-rw-r--r--] | setup.py | 6 | ||||
-rwxr-xr-x[-rw-r--r--] | tap_google_sheets/__init__.py | 12 | ||||
-rw-r--r-- | tap_google_sheets/client.py | 239 | ||||
-rw-r--r-- | tap_google_sheets/schema.py | 24 | ||||
-rw-r--r-- | tap_google_sheets/streams.py | 28 | ||||
-rw-r--r-- | tap_google_sheets/sync.py | 37 |
8 files changed, 176 insertions, 211 deletions
@@ -67,16 +67,21 @@ This tap: | |||
67 | - Process/send records to target | 67 | - Process/send records to target |
68 | 68 | ||
69 | ## Authentication | 69 | ## Authentication |
70 | The [**Google Sheets Setup & Authentication**](https://drive.google.com/open?id=1FojlvtLwS0-BzGS37R0jEXtwSHqSiO1Uw-7RKQQO-C4) Google Doc provides instructions show how to configure the Google Cloud API credentials to enable Google Drive and Google Sheets APIs, configure Google Cloud to authorize/verify your domain ownership, generate an API key (client_id, client_secret), authenticate and generate a refresh_token, and prepare your tap config.json with the necessary parameters. | 70 | |
71 | - Enable Googe Drive APIs and Authorization Scope: https://www.googleapis.com/auth/drive.metadata.readonly | 71 | You will need a Google developer project to use this tool. After [creating a project](https://console.developers.google.com/projectcreate) (or selecting an existing one) in your Google developers console the authentication can be configured in two different ways: |
72 | - Enable Google Sheets API and Authorization Scope: https://www.googleapis.com/auth/spreadsheets.readonly | 72 | |
73 | - Tap config.json parameters: | 73 | - Via an OAuth client which will ask the user to login to its Google user account. |
74 | - client_id: identifies your application | 74 | |
75 | - client_secret: authenticates your application | 75 | Please check the [“Creating application credentials”](https://github.com/googleapis/google-api-python-client/blob/d0110cf4f7aaa93d6f56fc028cd6a1e3d8dd300a/docs/oauth-installed.md#creating-application-credentials) paragraph of the Google Python library to download your Google credentials file. |
76 | - refresh_token: generates an access token to authorize your session | 76 | |
77 | - spreadsheet_id: unique identifier for each spreadsheet in Google Drive | 77 | - Via a Service account (ideal for server-to-server communication) |
78 | - start_date: absolute minimum start date to check file modified | 78 | |
79 | - user_agent: tap-name and email address; identifies your application in the Remote API server logs | 79 | Please check the [“Creating a service account”](https://github.com/googleapis/google-api-python-client/blob/d0110cf4f7aaa93d6f56fc028cd6a1e3d8dd300a/docs/oauth-server.md#creating-a-service-account) paragraph of the Google Python library to download your Google Service Account key file. |
80 | |||
81 | - Tap `config.json` parameters: | ||
82 | - `credentials_file`: the path to a valid Google credentials file (Either an OAuth client secrets file or a Service Account key file) | ||
83 | - `spreadsheet_id`: unique identifier for each spreadsheet in Google Drive | ||
84 | - `start_date`: absolute minimum start date to check file modified | ||
80 | 85 | ||
81 | ## Quick Start | 86 | ## Quick Start |
82 | 87 | ||
@@ -103,16 +108,13 @@ The [**Google Sheets Setup & Authentication**](https://drive.google.com/open?id= | |||
103 | - [singer-tools](https://github.com/singer-io/singer-tools) | 108 | - [singer-tools](https://github.com/singer-io/singer-tools) |
104 | - [target-stitch](https://github.com/singer-io/target-stitch) | 109 | - [target-stitch](https://github.com/singer-io/target-stitch) |
105 | 110 | ||
106 | 3. Create your tap's `config.json` file. Include the client_id, client_secret, refresh_token, site_urls (website URL properties in a comma delimited list; do not include the domain-level property in the list), start_date (UTC format), and user_agent (tap name with the api user email address). | 111 | 3. Create your tap's `config.json` file. Include the `credentials_file` path to your google secrets file as described in the [Authentication](#authentication) paragraph. |
107 | 112 | ||
108 | ```json | 113 | ```json |
109 | { | 114 | { |
110 | "client_id": "YOUR_CLIENT_ID", | 115 | "credentials_file": "PATH_TO_YOUR_GOOGLE_CREDENTIALS_FILE", |
111 | "client_secret": "YOUR_CLIENT_SECRET", | ||
112 | "refresh_token": "YOUR_REFRESH_TOKEN", | ||
113 | "spreadsheet_id": "YOUR_GOOGLE_SPREADSHEET_ID", | 116 | "spreadsheet_id": "YOUR_GOOGLE_SPREADSHEET_ID", |
114 | "start_date": "2019-01-01T00:00:00Z", | 117 | "start_date": "2019-01-01T00:00:00Z" |
115 | "user_agent": "tap-google-sheets <api_user_email@example.com>" | ||
116 | } | 118 | } |
117 | ``` | 119 | ``` |
118 | 120 | ||
diff --git a/config.json.example b/config.json.example index 159aafb..fba0b17 100644 --- a/config.json.example +++ b/config.json.example | |||
@@ -1,8 +1,5 @@ | |||
1 | { | 1 | { |
2 | "client_id": "YOUR_CLIENT_ID", | 2 | "credentials_file": "client-secrets.json", |
3 | "client_secret": "YOUR_CLIENT_SECRET", | ||
4 | "refresh_token": "YOUR_REFRESH_TOKEN", | ||
5 | "spreadsheet_id": "YOUR_GOOGLE_SPREADSHEET_ID", | 3 | "spreadsheet_id": "YOUR_GOOGLE_SPREADSHEET_ID", |
6 | "start_date": "2019-01-01T00:00:00Z", | 4 | "start_date": "2019-01-01T00:00:00Z" |
7 | "user_agent": "tap-google-search-console <api_user_email@example.com>" | ||
8 | } | 5 | } |
@@ -10,8 +10,10 @@ setup(name='tap-google-sheets', | |||
10 | py_modules=['tap_google_sheets'], | 10 | py_modules=['tap_google_sheets'], |
11 | install_requires=[ | 11 | install_requires=[ |
12 | 'backoff==1.8.0', | 12 | 'backoff==1.8.0', |
13 | 'requests==2.22.0', | 13 | 'singer-python==5.9.0', |
14 | 'singer-python==5.9.0' | 14 | 'google-api-python-client==1.12.5', |
15 | 'google-auth==1.23.0', | ||
16 | 'google-auth-oauthlib==0.4.2', | ||
15 | ], | 17 | ], |
16 | extras_require={ | 18 | extras_require={ |
17 | 'dev': [ | 19 | 'dev': [ |
diff --git a/tap_google_sheets/__init__.py b/tap_google_sheets/__init__.py index f97d4b8..15db05f 100644..100755 --- a/tap_google_sheets/__init__.py +++ b/tap_google_sheets/__init__.py | |||
@@ -12,12 +12,9 @@ from tap_google_sheets.sync import sync | |||
12 | LOGGER = singer.get_logger() | 12 | LOGGER = singer.get_logger() |
13 | 13 | ||
14 | REQUIRED_CONFIG_KEYS = [ | 14 | REQUIRED_CONFIG_KEYS = [ |
15 | 'client_id', | 15 | 'credentials_file', |
16 | 'client_secret', | ||
17 | 'refresh_token', | ||
18 | 'spreadsheet_id', | 16 | 'spreadsheet_id', |
19 | 'start_date', | 17 | 'start_date' |
20 | 'user_agent' | ||
21 | ] | 18 | ] |
22 | 19 | ||
23 | def do_discover(client, spreadsheet_id): | 20 | def do_discover(client, spreadsheet_id): |
@@ -33,10 +30,7 @@ def main(): | |||
33 | 30 | ||
34 | parsed_args = singer.utils.parse_args(REQUIRED_CONFIG_KEYS) | 31 | parsed_args = singer.utils.parse_args(REQUIRED_CONFIG_KEYS) |
35 | 32 | ||
36 | with GoogleClient(parsed_args.config['client_id'], | 33 | with GoogleClient(parsed_args.config['credentials_file']) as client: |
37 | parsed_args.config['client_secret'], | ||
38 | parsed_args.config['refresh_token'], | ||
39 | parsed_args.config['user_agent']) as client: | ||
40 | 34 | ||
41 | state = {} | 35 | state = {} |
42 | if parsed_args.state: | 36 | if parsed_args.state: |
diff --git a/tap_google_sheets/client.py b/tap_google_sheets/client.py index 4f38352..0cbad98 100644 --- a/tap_google_sheets/client.py +++ b/tap_google_sheets/client.py | |||
@@ -1,16 +1,21 @@ | |||
1 | from datetime import datetime, timedelta | 1 | from datetime import datetime, timedelta |
2 | from collections import OrderedDict | 2 | from collections import OrderedDict |
3 | import backoff | 3 | import backoff |
4 | import requests | ||
5 | import singer | 4 | import singer |
5 | import logging | ||
6 | import pickle | ||
7 | import json | ||
8 | import os | ||
6 | from singer import metrics | 9 | from singer import metrics |
7 | from singer import utils | 10 | from singer import utils |
11 | from google.oauth2 import service_account | ||
12 | from google_auth_oauthlib.flow import InstalledAppFlow | ||
13 | from google.auth.transport.requests import Request | ||
14 | from googleapiclient.errors import HttpError | ||
15 | import googleapiclient.discovery | ||
8 | 16 | ||
9 | BASE_URL = 'https://www.googleapis.com' | ||
10 | GOOGLE_TOKEN_URI = 'https://oauth2.googleapis.com/token' | ||
11 | LOGGER = singer.get_logger() | 17 | LOGGER = singer.get_logger() |
12 | 18 | ||
13 | |||
14 | class Server5xxError(Exception): | 19 | class Server5xxError(Exception): |
15 | pass | 20 | pass |
16 | 21 | ||
@@ -101,90 +106,85 @@ ERROR_CODE_EXCEPTION_MAPPING = { | |||
101 | 428: GooglePreconditionRequiredError, | 106 | 428: GooglePreconditionRequiredError, |
102 | 500: GoogleInternalServiceError} | 107 | 500: GoogleInternalServiceError} |
103 | 108 | ||
104 | |||
105 | def get_exception_for_error_code(error_code): | ||
106 | return ERROR_CODE_EXCEPTION_MAPPING.get(error_code, GoogleError) | ||
107 | |||
108 | def raise_for_error(response): | ||
109 | try: | ||
110 | response.raise_for_status() | ||
111 | except (requests.HTTPError, requests.ConnectionError) as error: | ||
112 | try: | ||
113 | content_length = len(response.content) | ||
114 | if content_length == 0: | ||
115 | # There is nothing we can do here since Google has neither sent | ||
116 | # us a 2xx response nor a response content. | ||
117 | return | ||
118 | response = response.json() | ||
119 | if ('error' in response) or ('errorCode' in response): | ||
120 | message = '%s: %s' % (response.get('error', str(error)), | ||
121 | response.get('message', 'Unknown Error')) | ||
122 | error_code = response.get('error', {}).get('code') | ||
123 | ex = get_exception_for_error_code(error_code) | ||
124 | raise ex(message) | ||
125 | raise GoogleError(error) | ||
126 | except (ValueError, TypeError): | ||
127 | raise GoogleError(error) | ||
128 | |||
129 | class GoogleClient: # pylint: disable=too-many-instance-attributes | 109 | class GoogleClient: # pylint: disable=too-many-instance-attributes |
130 | def __init__(self, | 110 | SCOPES = [ |
131 | client_id, | 111 | "https://www.googleapis.com/auth/drive.metadata.readonly", |
132 | client_secret, | 112 | "https://www.googleapis.com/auth/spreadsheets.readonly" |
133 | refresh_token, | 113 | ] |
134 | user_agent=None): | 114 | |
135 | self.__client_id = client_id | 115 | def __init__(self, credentials_file): |
136 | self.__client_secret = client_secret | 116 | self.__credentials = self.fetchCredentials(credentials_file) |
137 | self.__refresh_token = refresh_token | 117 | self.__sheets_service = googleapiclient.discovery.build( |
138 | self.__user_agent = user_agent | 118 | 'sheets', |
139 | self.__access_token = None | 119 | 'v4', |
140 | self.__expires = None | 120 | credentials=self.__credentials, |
141 | self.__session = requests.Session() | 121 | cache_discovery=False |
142 | self.base_url = None | 122 | ) |
143 | 123 | self.__drive_service = googleapiclient.discovery.build( | |
124 | 'drive', | ||
125 | 'v3', | ||
126 | credentials=self.__credentials, | ||
127 | cache_discovery=False | ||
128 | ) | ||
129 | |||
130 | def fetchCredentials(self, credentials_file): | ||
131 | LOGGER.debug('authenticate with google') | ||
132 | data = None | ||
133 | |||
134 | # Check a credentials file exist | ||
135 | if not os.path.exists(credentials_file): | ||
136 | raise Exception("The configured Google credentials file {} doesn't exist".format(credentials_file)) | ||
137 | |||
138 | # Load credentials json file | ||
139 | with open(credentials_file) as json_file: | ||
140 | data = json.load(json_file) | ||
141 | |||
142 | if data.get('type', '') == 'service_account': | ||
143 | return self.fetchServiceAccountCredentials(credentials_file) | ||
144 | elif data.get('installed'): | ||
145 | return self.fetchInstalledOAuthCredentials(credentials_file) | ||
146 | else: | ||
147 | raise Exception("""This Google credentials file is not yet recognize. | ||
148 | |||
149 | Please use either: | ||
150 | - a Service Account (https://github.com/googleapis/google-api-python-client/blob/d0110cf4f7aaa93d6f56fc028cd6a1e3d8dd300a/docs/oauth-server.md) | ||
151 | - an installed OAuth client (https://github.com/googleapis/google-api-python-client/blob/d0110cf4f7aaa93d6f56fc028cd6a1e3d8dd300a/docs/oauth-installed.md)""" | ||
152 | ) | ||
153 | |||
154 | def fetchServiceAccountCredentials(self, credentials_file): | ||
155 | # The service account credentials file can be used for server-to-server applications | ||
156 | return service_account.Credentials.from_service_account_file( | ||
157 | credentials_file, scopes=GoogleClient.SCOPES) | ||
158 | |||
159 | def fetchInstalledOAuthCredentials(self, credentials_file): | ||
160 | creds = None | ||
161 | |||
162 | # The file token.pickle stores the user's access and refresh tokens, and is | ||
163 | # created automatically when the authorization flow completes for the first | ||
164 | # time. | ||
165 | if os.path.exists('token.pickle'): | ||
166 | with open('token.pickle', 'rb') as token: | ||
167 | creds = pickle.load(token) | ||
168 | |||
169 | # If there are no (valid) credentials available, let the user log in. | ||
170 | if not creds or not creds.valid: | ||
171 | if creds and creds.expired and creds.refresh_token: | ||
172 | creds.refresh(Request()) | ||
173 | else: | ||
174 | flow = InstalledAppFlow.from_client_secrets_file( | ||
175 | credentials_file, GoogleClient.SCOPES) | ||
176 | creds = flow.run_local_server(port=0) | ||
177 | # Save the credentials for the next run | ||
178 | with open('token.pickle', 'wb') as token: | ||
179 | pickle.dump(creds, token) | ||
180 | |||
181 | return creds | ||
144 | 182 | ||
145 | def __enter__(self): | 183 | def __enter__(self): |
146 | self.get_access_token() | ||
147 | return self | 184 | return self |
148 | 185 | ||
149 | def __exit__(self, exception_type, exception_value, traceback): | 186 | def __exit__(self, exception_type, exception_value, traceback): |
150 | self.__session.close() | 187 | LOGGER.debug('exiting google client') |
151 | |||
152 | |||
153 | @backoff.on_exception(backoff.expo, | ||
154 | Server5xxError, | ||
155 | max_tries=5, | ||
156 | factor=2) | ||
157 | def get_access_token(self): | ||
158 | # The refresh_token never expires and may be used many times to generate each access_token | ||
159 | # Since the refresh_token does not expire, it is not included in get access_token response | ||
160 | if self.__access_token is not None and self.__expires > datetime.utcnow(): | ||
161 | return | ||
162 | |||
163 | headers = {} | ||
164 | if self.__user_agent: | ||
165 | headers['User-Agent'] = self.__user_agent | ||
166 | |||
167 | response = self.__session.post( | ||
168 | url=GOOGLE_TOKEN_URI, | ||
169 | headers=headers, | ||
170 | data={ | ||
171 | 'grant_type': 'refresh_token', | ||
172 | 'client_id': self.__client_id, | ||
173 | 'client_secret': self.__client_secret, | ||
174 | 'refresh_token': self.__refresh_token, | ||
175 | }) | ||
176 | |||
177 | if response.status_code >= 500: | ||
178 | raise Server5xxError() | ||
179 | |||
180 | if response.status_code != 200: | ||
181 | raise_for_error(response) | ||
182 | |||
183 | data = response.json() | ||
184 | self.__access_token = data['access_token'] | ||
185 | self.__expires = datetime.utcnow() + timedelta(seconds=data['expires_in']) | ||
186 | LOGGER.info('Authorized, token expires = {}'.format(self.__expires)) | ||
187 | |||
188 | 188 | ||
189 | # Rate Limit: https://developers.google.com/sheets/api/limits | 189 | # Rate Limit: https://developers.google.com/sheets/api/limits |
190 | # 100 request per 100 seconds per User | 190 | # 100 request per 100 seconds per User |
@@ -193,53 +193,48 @@ class GoogleClient: # pylint: disable=too-many-instance-attributes | |||
193 | max_tries=7, | 193 | max_tries=7, |
194 | factor=3) | 194 | factor=3) |
195 | @utils.ratelimit(100, 100) | 195 | @utils.ratelimit(100, 100) |
196 | def request(self, method, path=None, url=None, api=None, **kwargs): | 196 | def request(self, endpoint=None, params={}, **kwargs): |
197 | self.get_access_token() | 197 | formatted_params = {} |
198 | self.base_url = 'https://sheets.googleapis.com/v4' | 198 | for (key, value) in params.items(): |
199 | if api == 'files': | 199 | # API parameters interpolation |
200 | self.base_url = 'https://www.googleapis.com/drive/v3' | 200 | # will raise a KeyError in case a necessary argument is missing |
201 | 201 | formatted_params[key] = value.format(**kwargs) | |
202 | if not url and path: | 202 | |
203 | url = '{}/{}'.format(self.base_url, path) | 203 | # Call the correct Google API depending on the stream name |
204 | 204 | if endpoint == 'spreadsheet_metadata' or endpoint == 'sheet_metadata': | |
205 | # endpoint = stream_name (from sync.py API call) | 205 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/get |
206 | if 'endpoint' in kwargs: | 206 | request = self.__sheets_service.spreadsheets().get(**formatted_params) |
207 | endpoint = kwargs['endpoint'] | 207 | elif endpoint == 'sheets_loaded': |
208 | del kwargs['endpoint'] | 208 | # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets.values/get |
209 | request = self.__sheets_service.spreadsheets().values().get(**formatted_params) | ||
210 | elif endpoint == 'file_metadata': | ||
211 | # https://developers.google.com/drive/api/v3/reference/files/get | ||
212 | request = self.__drive_service.files().get(**formatted_params) | ||
209 | else: | 213 | else: |
210 | endpoint = None | 214 | raise Exception('{} not implemented yet!'.format(endpoint)) |
211 | LOGGER.info('{} URL = {}'.format(endpoint, url)) | ||
212 | |||
213 | if 'headers' not in kwargs: | ||
214 | kwargs['headers'] = {} | ||
215 | kwargs['headers']['Authorization'] = 'Bearer {}'.format(self.__access_token) | ||
216 | 215 | ||
217 | if self.__user_agent: | 216 | with metrics.http_request_timer(endpoint) as timer: |
218 | kwargs['headers']['User-Agent'] = self.__user_agent | 217 | error = None |
218 | status_code = 400 | ||
219 | 219 | ||
220 | if method == 'POST': | 220 | try: |
221 | kwargs['headers']['Content-Type'] = 'application/json' | 221 | response = request.execute() |
222 | status_code = 200 | ||
223 | except HttpError as e: | ||
224 | status_code = e.resp.status or status_code | ||
225 | error = e | ||
222 | 226 | ||
223 | with metrics.http_request_timer(endpoint) as timer: | 227 | timer.tags[metrics.Tag.http_status_code] = status_code |
224 | response = self.__session.request(method, url, **kwargs) | ||
225 | timer.tags[metrics.Tag.http_status_code] = response.status_code | ||
226 | 228 | ||
227 | if response.status_code >= 500: | 229 | if status_code >= 500: |
228 | raise Server5xxError() | 230 | raise Server5xxError() |
229 | 231 | ||
230 | #Use retry functionality in backoff to wait and retry if | 232 | # Use retry functionality in backoff to wait and retry if |
231 | #response code equals 429 because rate limit has been exceeded | 233 | # response code equals 429 because rate limit has been exceeded |
232 | if response.status_code == 429: | 234 | if status_code == 429: |
233 | raise Server429Error() | 235 | raise Server429Error() |
234 | 236 | ||
235 | if response.status_code != 200: | 237 | if status_code != 200: |
236 | raise_for_error(response) | 238 | raise error |
237 | |||
238 | # Ensure keys and rows are ordered as received from API | ||
239 | return response.json(object_pairs_hook=OrderedDict) | ||
240 | |||
241 | def get(self, path, api, **kwargs): | ||
242 | return self.request(method='GET', path=path, api=api, **kwargs) | ||
243 | 239 | ||
244 | def post(self, path, api, **kwargs): | 240 | return response |
245 | return self.request(method='POST', path=path, api=api, **kwargs) | ||
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index fcaccf9..56d2fb9 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py | |||
@@ -224,16 +224,13 @@ def get_sheet_metadata(sheet, spreadsheet_id, client): | |||
224 | 224 | ||
225 | stream_name = 'sheet_metadata' | 225 | stream_name = 'sheet_metadata' |
226 | stream_metadata = STREAMS.get(stream_name) | 226 | stream_metadata = STREAMS.get(stream_name) |
227 | api = stream_metadata.get('api', 'sheets') | ||
228 | params = stream_metadata.get('params', {}) | 227 | params = stream_metadata.get('params', {}) |
229 | sheet_title_encoded = urllib.parse.quote_plus(sheet_title) | 228 | |
230 | sheet_title_escaped = re.escape(sheet_title) | 229 | # GET sheet_metadata |
231 | querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \ | 230 | sheet_md_results = client.request(endpoint=stream_name, |
232 | params.items()]).replace('{sheet_title}', sheet_title_encoded) | 231 | spreadsheet_id=spreadsheet_id, |
233 | path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ | 232 | sheet_title=sheet_title, |
234 | spreadsheet_id), querystring) | 233 | params=params) |
235 | |||
236 | sheet_md_results = client.get(path=path, api=api, endpoint=sheet_title_escaped) | ||
237 | # sheet_metadata: 1st `sheets` node in results | 234 | # sheet_metadata: 1st `sheets` node in results |
238 | sheet_metadata = sheet_md_results.get('sheets')[0] | 235 | sheet_metadata = sheet_md_results.get('sheets')[0] |
239 | 236 | ||
@@ -275,15 +272,12 @@ def get_schemas(client, spreadsheet_id): | |||
275 | field_metadata[stream_name] = mdata | 272 | field_metadata[stream_name] = mdata |
276 | 273 | ||
277 | if stream_name == 'spreadsheet_metadata': | 274 | if stream_name == 'spreadsheet_metadata': |
278 | api = stream_metadata.get('api', 'sheets') | ||
279 | params = stream_metadata.get('params', {}) | 275 | params = stream_metadata.get('params', {}) |
280 | querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]) | ||
281 | path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ | ||
282 | spreadsheet_id), querystring) | ||
283 | 276 | ||
284 | # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet) | 277 | # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet) |
285 | spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \ | 278 | spreadsheet_md_results = client.request(endpoint=stream_name, |
286 | endpoint=stream_name) | 279 | spreadsheet_id=spreadsheet_id, |
280 | params=params) | ||
287 | 281 | ||
288 | sheets = spreadsheet_md_results.get('sheets') | 282 | sheets = spreadsheet_md_results.get('sheets') |
289 | if sheets: | 283 | if sheets: |
diff --git a/tap_google_sheets/streams.py b/tap_google_sheets/streams.py index ad5529f..f7bf8ac 100644 --- a/tap_google_sheets/streams.py +++ b/tap_google_sheets/streams.py | |||
@@ -2,9 +2,7 @@ from collections import OrderedDict | |||
2 | 2 | ||
3 | # streams: API URL endpoints to be called | 3 | # streams: API URL endpoints to be called |
4 | # properties: | 4 | # properties: |
5 | # <root node>: Plural stream name for the endpoint | 5 | # <root node>: Plural stream name which will condition the endpoint called |
6 | # path: API endpoint relative path, when added to the base URL, creates the full path, | ||
7 | # default = stream_name | ||
8 | # key_properties: Primary key fields for identifying an endpoint record. | 6 | # key_properties: Primary key fields for identifying an endpoint record. |
9 | # replication_method: INCREMENTAL or FULL_TABLE | 7 | # replication_method: INCREMENTAL or FULL_TABLE |
10 | # replication_keys: bookmark_field(s), typically a date-time, used for filtering the results | 8 | # replication_keys: bookmark_field(s), typically a date-time, used for filtering the results |
@@ -15,51 +13,51 @@ from collections import OrderedDict | |||
15 | 13 | ||
16 | # file_metadata: Queries Google Drive API to get file information and see if file has been modified | 14 | # file_metadata: Queries Google Drive API to get file information and see if file has been modified |
17 | # Provides audit info about who and when last changed the file. | 15 | # Provides audit info about who and when last changed the file. |
16 | # cf https://developers.google.com/drive/api/v3/reference/files/get | ||
18 | FILE_METADATA = { | 17 | FILE_METADATA = { |
19 | "api": "files", | ||
20 | "path": "files/{spreadsheet_id}", | ||
21 | "key_properties": ["id"], | 18 | "key_properties": ["id"], |
22 | "replication_method": "INCREMENTAL", | 19 | "replication_method": "INCREMENTAL", |
23 | "replication_keys": ["modifiedTime"], | 20 | "replication_keys": ["modifiedTime"], |
24 | "params": { | 21 | "params": { |
22 | "fileId": "{spreadsheet_id}", | ||
25 | "fields": "id,name,createdTime,modifiedTime,version,teamDriveId,driveId,lastModifyingUser" | 23 | "fields": "id,name,createdTime,modifiedTime,version,teamDriveId,driveId,lastModifyingUser" |
26 | } | 24 | } |
27 | } | 25 | } |
28 | 26 | ||
29 | # spreadsheet_metadata: Queries spreadsheet to get basic information on spreadhsheet and sheets | 27 | # spreadsheet_metadata: Queries spreadsheet to get basic information on spreadhsheet and sheets |
28 | # cf https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/get | ||
30 | SPREADSHEET_METADATA = { | 29 | SPREADSHEET_METADATA = { |
31 | "api": "sheets", | ||
32 | "path": "spreadsheets/{spreadsheet_id}", | ||
33 | "key_properties": ["spreadsheetId"], | 30 | "key_properties": ["spreadsheetId"], |
34 | "replication_method": "FULL_TABLE", | 31 | "replication_method": "FULL_TABLE", |
35 | "params": { | 32 | "params": { |
36 | "includeGridData": "false" | 33 | "spreadsheetId": "{spreadsheet_id}" |
37 | } | 34 | } |
38 | } | 35 | } |
39 | 36 | ||
40 | # sheet_metadata: Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet. | 37 | # sheet_metadata: Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet. |
41 | # This endpoint includes detailed metadata about each cell in the header and first data row | 38 | # This endpoint includes detailed metadata about each cell in the header and first data row |
42 | # incl. data type, formatting, etc. | 39 | # incl. data type, formatting, etc. |
40 | # cf https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/get | ||
43 | SHEET_METADATA = { | 41 | SHEET_METADATA = { |
44 | "api": "sheets", | ||
45 | "path": "spreadsheets/{spreadsheet_id}", | ||
46 | "key_properties": ["sheetId"], | 42 | "key_properties": ["sheetId"], |
47 | "replication_method": "FULL_TABLE", | 43 | "replication_method": "FULL_TABLE", |
48 | "params": { | 44 | "params": { |
45 | "spreadsheetId": "{spreadsheet_id}", | ||
49 | "includeGridData": "true", | 46 | "includeGridData": "true", |
50 | "ranges": "'{sheet_title}'!1:2" | 47 | "ranges": "'{sheet_title}'!1:2" |
51 | } | 48 | } |
52 | } | 49 | } |
53 | 50 | ||
54 | # sheets_loaded: Queries a batch of Rows for each Sheet in the Spreadsheet. | 51 | # sheets_loaded: Queries a batch of Rows for each Sheet in the Spreadsheet. |
55 | # Each query uses the `values` endpoint, to get data-only, w/out the formatting/type metadata. | 52 | # Each query uses the `values` endpoint, to get data-only, w/out the formatting/type metadata. |
53 | # cf https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets.values/get | ||
56 | SHEETS_LOADED = { | 54 | SHEETS_LOADED = { |
57 | "api": "sheets", | ||
58 | "path": "spreadsheets/{spreadsheet_id}/values/'{sheet_title}'!{range_rows}", | ||
59 | "data_key": "values", | 55 | "data_key": "values", |
60 | "key_properties": ["spreadsheetId", "sheetId", "loadDate"], | 56 | "key_properties": ["spreadsheetId", "sheetId", "loadDate"], |
61 | "replication_method": "FULL_TABLE", | 57 | "replication_method": "FULL_TABLE", |
62 | "params": { | 58 | "params": { |
59 | "spreadsheetId": "{spreadsheet_id}", | ||
60 | "range": "'{sheet_title}'!{range_rows}", | ||
63 | "dateTimeRenderOption": "SERIAL_NUMBER", | 61 | "dateTimeRenderOption": "SERIAL_NUMBER", |
64 | "valueRenderOption": "UNFORMATTED_VALUE", | 62 | "valueRenderOption": "UNFORMATTED_VALUE", |
65 | "majorDimension": "ROWS" | 63 | "majorDimension": "ROWS" |
diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py index 26c2d19..c67055a 100644 --- a/tap_google_sheets/sync.py +++ b/tap_google_sheets/sync.py | |||
@@ -141,35 +141,17 @@ def get_selected_fields(catalog, stream_name): | |||
141 | pass | 141 | pass |
142 | return selected_fields | 142 | return selected_fields |
143 | 143 | ||
144 | |||
145 | def get_data(stream_name, | 144 | def get_data(stream_name, |
146 | endpoint_config, | 145 | endpoint_config, |
147 | client, | 146 | client, |
148 | spreadsheet_id, | 147 | **kwargs): |
149 | range_rows=None): | ||
150 | if not range_rows: | ||
151 | range_rows = '' | ||
152 | # Replace {placeholder} variables in path | ||
153 | # Encode stream_name: fixes issue w/ special characters in sheet name | ||
154 | stream_name_escaped = re.escape(stream_name) | ||
155 | stream_name_encoded = urllib.parse.quote_plus(stream_name) | ||
156 | path = endpoint_config.get('path', stream_name).replace( | ||
157 | '{spreadsheet_id}', spreadsheet_id).replace('{sheet_title}', stream_name_encoded).replace( | ||
158 | '{range_rows}', range_rows) | ||
159 | params = endpoint_config.get('params', {}) | 148 | params = endpoint_config.get('params', {}) |
160 | api = endpoint_config.get('api', 'sheets') | 149 | LOGGER.info('GET {}'.format(stream_name)) |
161 | # Add in querystring parameters and replace {placeholder} variables | ||
162 | # querystring function ensures parameters are added but not encoded causing API errors | ||
163 | querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]).replace( | ||
164 | '{sheet_title}', stream_name_encoded) | ||
165 | LOGGER.info('URL: {}/{}?{}'.format(client.base_url, path, querystring)) | ||
166 | data = {} | ||
167 | time_extracted = utils.now() | 150 | time_extracted = utils.now() |
168 | data = client.get( | 151 | data = client.request( |
169 | path=path, | 152 | endpoint=stream_name, |
170 | api=api, | 153 | params=params, |
171 | params=querystring, | 154 | **kwargs) |
172 | endpoint=stream_name_escaped) | ||
173 | return data, time_extracted | 155 | return data, time_extracted |
174 | 156 | ||
175 | 157 | ||
@@ -382,7 +364,7 @@ def sync(client, config, catalog, state): | |||
382 | file_metadata_config = STREAMS.get(stream_name) | 364 | file_metadata_config = STREAMS.get(stream_name) |
383 | 365 | ||
384 | # GET file_metadata | 366 | # GET file_metadata |
385 | LOGGER.info('GET file_meatadata') | 367 | LOGGER.info('GET file_metadata') |
386 | file_metadata, time_extracted = get_data(stream_name=stream_name, | 368 | file_metadata, time_extracted = get_data(stream_name=stream_name, |
387 | endpoint_config=file_metadata_config, | 369 | endpoint_config=file_metadata_config, |
388 | client=client, | 370 | client=client, |
@@ -497,11 +479,12 @@ def sync(client, config, catalog, state): | |||
497 | while not is_last_row and from_row < sheet_max_row and to_row <= sheet_max_row: | 479 | while not is_last_row and from_row < sheet_max_row and to_row <= sheet_max_row: |
498 | range_rows = 'A{}:{}{}'.format(from_row, sheet_last_col_letter, to_row) | 480 | range_rows = 'A{}:{}{}'.format(from_row, sheet_last_col_letter, to_row) |
499 | 481 | ||
500 | # GET sheet_data for a worksheet tab | 482 | # GET sheets_loaded for a worksheet tab |
501 | sheet_data, time_extracted = get_data( | 483 | sheet_data, time_extracted = get_data( |
502 | stream_name=sheet_title, | 484 | stream_name='sheets_loaded', |
503 | endpoint_config=sheets_loaded_config, | 485 | endpoint_config=sheets_loaded_config, |
504 | client=client, | 486 | client=client, |
487 | sheet_title=sheet_title, | ||
505 | spreadsheet_id=spreadsheet_id, | 488 | spreadsheet_id=spreadsheet_id, |
506 | range_rows=range_rows) | 489 | range_rows=range_rows) |
507 | # Data is returned as a list of arrays, an array of values for each row | 490 | # Data is returned as a list of arrays, an array of values for each row |