aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--README.md34
-rw-r--r--config.json.example7
-rwxr-xr-x[-rw-r--r--]setup.py6
-rwxr-xr-x[-rw-r--r--]tap_google_sheets/__init__.py12
-rw-r--r--tap_google_sheets/client.py239
-rw-r--r--tap_google_sheets/schema.py24
-rw-r--r--tap_google_sheets/streams.py28
-rw-r--r--tap_google_sheets/sync.py37
8 files changed, 176 insertions, 211 deletions
diff --git a/README.md b/README.md
index 9470411..04e7667 100644
--- a/README.md
+++ b/README.md
@@ -67,16 +67,21 @@ This tap:
67 - Process/send records to target 67 - Process/send records to target
68 68
69## Authentication 69## Authentication
70The [**Google Sheets Setup & Authentication**](https://drive.google.com/open?id=1FojlvtLwS0-BzGS37R0jEXtwSHqSiO1Uw-7RKQQO-C4) Google Doc provides instructions show how to configure the Google Cloud API credentials to enable Google Drive and Google Sheets APIs, configure Google Cloud to authorize/verify your domain ownership, generate an API key (client_id, client_secret), authenticate and generate a refresh_token, and prepare your tap config.json with the necessary parameters. 70
71- Enable Googe Drive APIs and Authorization Scope: https://www.googleapis.com/auth/drive.metadata.readonly 71You will need a Google developer project to use this tool. After [creating a project](https://console.developers.google.com/projectcreate) (or selecting an existing one) in your Google developers console the authentication can be configured in two different ways:
72- Enable Google Sheets API and Authorization Scope: https://www.googleapis.com/auth/spreadsheets.readonly 72
73- Tap config.json parameters: 73- Via an OAuth client which will ask the user to login to its Google user account.
74 - client_id: identifies your application 74
75 - client_secret: authenticates your application 75 Please check the [“Creating application credentials”](https://github.com/googleapis/google-api-python-client/blob/d0110cf4f7aaa93d6f56fc028cd6a1e3d8dd300a/docs/oauth-installed.md#creating-application-credentials) paragraph of the Google Python library to download your Google credentials file.
76 - refresh_token: generates an access token to authorize your session 76
77 - spreadsheet_id: unique identifier for each spreadsheet in Google Drive 77- Via a Service account (ideal for server-to-server communication)
78 - start_date: absolute minimum start date to check file modified 78
79 - user_agent: tap-name and email address; identifies your application in the Remote API server logs 79 Please check the [“Creating a service account”](https://github.com/googleapis/google-api-python-client/blob/d0110cf4f7aaa93d6f56fc028cd6a1e3d8dd300a/docs/oauth-server.md#creating-a-service-account) paragraph of the Google Python library to download your Google Service Account key file.
80
81- Tap `config.json` parameters:
82 - `credentials_file`: the path to a valid Google credentials file (Either an OAuth client secrets file or a Service Account key file)
83 - `spreadsheet_id`: unique identifier for each spreadsheet in Google Drive
84 - `start_date`: absolute minimum start date to check file modified
80 85
81## Quick Start 86## Quick Start
82 87
@@ -103,16 +108,13 @@ The [**Google Sheets Setup & Authentication**](https://drive.google.com/open?id=
103 - [singer-tools](https://github.com/singer-io/singer-tools) 108 - [singer-tools](https://github.com/singer-io/singer-tools)
104 - [target-stitch](https://github.com/singer-io/target-stitch) 109 - [target-stitch](https://github.com/singer-io/target-stitch)
105 110
1063. Create your tap's `config.json` file. Include the client_id, client_secret, refresh_token, site_urls (website URL properties in a comma delimited list; do not include the domain-level property in the list), start_date (UTC format), and user_agent (tap name with the api user email address). 1113. Create your tap's `config.json` file. Include the `credentials_file` path to your google secrets file as described in the [Authentication](#authentication) paragraph.
107 112
108 ```json 113 ```json
109 { 114 {
110 "client_id": "YOUR_CLIENT_ID", 115 "credentials_file": "PATH_TO_YOUR_GOOGLE_CREDENTIALS_FILE",
111 "client_secret": "YOUR_CLIENT_SECRET",
112 "refresh_token": "YOUR_REFRESH_TOKEN",
113 "spreadsheet_id": "YOUR_GOOGLE_SPREADSHEET_ID", 116 "spreadsheet_id": "YOUR_GOOGLE_SPREADSHEET_ID",
114 "start_date": "2019-01-01T00:00:00Z", 117 "start_date": "2019-01-01T00:00:00Z"
115 "user_agent": "tap-google-sheets <api_user_email@example.com>"
116 } 118 }
117 ``` 119 ```
118 120
diff --git a/config.json.example b/config.json.example
index 159aafb..fba0b17 100644
--- a/config.json.example
+++ b/config.json.example
@@ -1,8 +1,5 @@
1{ 1{
2 "client_id": "YOUR_CLIENT_ID", 2 "credentials_file": "client-secrets.json",
3 "client_secret": "YOUR_CLIENT_SECRET",
4 "refresh_token": "YOUR_REFRESH_TOKEN",
5 "spreadsheet_id": "YOUR_GOOGLE_SPREADSHEET_ID", 3 "spreadsheet_id": "YOUR_GOOGLE_SPREADSHEET_ID",
6 "start_date": "2019-01-01T00:00:00Z", 4 "start_date": "2019-01-01T00:00:00Z"
7 "user_agent": "tap-google-search-console <api_user_email@example.com>"
8} 5}
diff --git a/setup.py b/setup.py
index f3fd2c3..ddea949 100644..100755
--- a/setup.py
+++ b/setup.py
@@ -10,8 +10,10 @@ setup(name='tap-google-sheets',
10 py_modules=['tap_google_sheets'], 10 py_modules=['tap_google_sheets'],
11 install_requires=[ 11 install_requires=[
12 'backoff==1.8.0', 12 'backoff==1.8.0',
13 'requests==2.22.0', 13 'singer-python==5.9.0',
14 'singer-python==5.9.0' 14 'google-api-python-client==1.12.5',
15 'google-auth==1.23.0',
16 'google-auth-oauthlib==0.4.2',
15 ], 17 ],
16 extras_require={ 18 extras_require={
17 'dev': [ 19 'dev': [
diff --git a/tap_google_sheets/__init__.py b/tap_google_sheets/__init__.py
index f97d4b8..15db05f 100644..100755
--- a/tap_google_sheets/__init__.py
+++ b/tap_google_sheets/__init__.py
@@ -12,12 +12,9 @@ from tap_google_sheets.sync import sync
12LOGGER = singer.get_logger() 12LOGGER = singer.get_logger()
13 13
14REQUIRED_CONFIG_KEYS = [ 14REQUIRED_CONFIG_KEYS = [
15 'client_id', 15 'credentials_file',
16 'client_secret',
17 'refresh_token',
18 'spreadsheet_id', 16 'spreadsheet_id',
19 'start_date', 17 'start_date'
20 'user_agent'
21] 18]
22 19
23def do_discover(client, spreadsheet_id): 20def do_discover(client, spreadsheet_id):
@@ -33,10 +30,7 @@ def main():
33 30
34 parsed_args = singer.utils.parse_args(REQUIRED_CONFIG_KEYS) 31 parsed_args = singer.utils.parse_args(REQUIRED_CONFIG_KEYS)
35 32
36 with GoogleClient(parsed_args.config['client_id'], 33 with GoogleClient(parsed_args.config['credentials_file']) as client:
37 parsed_args.config['client_secret'],
38 parsed_args.config['refresh_token'],
39 parsed_args.config['user_agent']) as client:
40 34
41 state = {} 35 state = {}
42 if parsed_args.state: 36 if parsed_args.state:
diff --git a/tap_google_sheets/client.py b/tap_google_sheets/client.py
index 4f38352..0cbad98 100644
--- a/tap_google_sheets/client.py
+++ b/tap_google_sheets/client.py
@@ -1,16 +1,21 @@
1from datetime import datetime, timedelta 1from datetime import datetime, timedelta
2from collections import OrderedDict 2from collections import OrderedDict
3import backoff 3import backoff
4import requests
5import singer 4import singer
5import logging
6import pickle
7import json
8import os
6from singer import metrics 9from singer import metrics
7from singer import utils 10from singer import utils
11from google.oauth2 import service_account
12from google_auth_oauthlib.flow import InstalledAppFlow
13from google.auth.transport.requests import Request
14from googleapiclient.errors import HttpError
15import googleapiclient.discovery
8 16
9BASE_URL = 'https://www.googleapis.com'
10GOOGLE_TOKEN_URI = 'https://oauth2.googleapis.com/token'
11LOGGER = singer.get_logger() 17LOGGER = singer.get_logger()
12 18
13
14class Server5xxError(Exception): 19class Server5xxError(Exception):
15 pass 20 pass
16 21
@@ -101,90 +106,85 @@ ERROR_CODE_EXCEPTION_MAPPING = {
101 428: GooglePreconditionRequiredError, 106 428: GooglePreconditionRequiredError,
102 500: GoogleInternalServiceError} 107 500: GoogleInternalServiceError}
103 108
104
105def get_exception_for_error_code(error_code):
106 return ERROR_CODE_EXCEPTION_MAPPING.get(error_code, GoogleError)
107
108def raise_for_error(response):
109 try:
110 response.raise_for_status()
111 except (requests.HTTPError, requests.ConnectionError) as error:
112 try:
113 content_length = len(response.content)
114 if content_length == 0:
115 # There is nothing we can do here since Google has neither sent
116 # us a 2xx response nor a response content.
117 return
118 response = response.json()
119 if ('error' in response) or ('errorCode' in response):
120 message = '%s: %s' % (response.get('error', str(error)),
121 response.get('message', 'Unknown Error'))
122 error_code = response.get('error', {}).get('code')
123 ex = get_exception_for_error_code(error_code)
124 raise ex(message)
125 raise GoogleError(error)
126 except (ValueError, TypeError):
127 raise GoogleError(error)
128
129class GoogleClient: # pylint: disable=too-many-instance-attributes 109class GoogleClient: # pylint: disable=too-many-instance-attributes
130 def __init__(self, 110 SCOPES = [
131 client_id, 111 "https://www.googleapis.com/auth/drive.metadata.readonly",
132 client_secret, 112 "https://www.googleapis.com/auth/spreadsheets.readonly"
133 refresh_token, 113 ]
134 user_agent=None): 114
135 self.__client_id = client_id 115 def __init__(self, credentials_file):
136 self.__client_secret = client_secret 116 self.__credentials = self.fetchCredentials(credentials_file)
137 self.__refresh_token = refresh_token 117 self.__sheets_service = googleapiclient.discovery.build(
138 self.__user_agent = user_agent 118 'sheets',
139 self.__access_token = None 119 'v4',
140 self.__expires = None 120 credentials=self.__credentials,
141 self.__session = requests.Session() 121 cache_discovery=False
142 self.base_url = None 122 )
143 123 self.__drive_service = googleapiclient.discovery.build(
124 'drive',
125 'v3',
126 credentials=self.__credentials,
127 cache_discovery=False
128 )
129
130 def fetchCredentials(self, credentials_file):
131 LOGGER.debug('authenticate with google')
132 data = None
133
134 # Check a credentials file exist
135 if not os.path.exists(credentials_file):
136 raise Exception("The configured Google credentials file {} doesn't exist".format(credentials_file))
137
138 # Load credentials json file
139 with open(credentials_file) as json_file:
140 data = json.load(json_file)
141
142 if data.get('type', '') == 'service_account':
143 return self.fetchServiceAccountCredentials(credentials_file)
144 elif data.get('installed'):
145 return self.fetchInstalledOAuthCredentials(credentials_file)
146 else:
147 raise Exception("""This Google credentials file is not yet recognize.
148
149 Please use either:
150 - a Service Account (https://github.com/googleapis/google-api-python-client/blob/d0110cf4f7aaa93d6f56fc028cd6a1e3d8dd300a/docs/oauth-server.md)
151 - an installed OAuth client (https://github.com/googleapis/google-api-python-client/blob/d0110cf4f7aaa93d6f56fc028cd6a1e3d8dd300a/docs/oauth-installed.md)"""
152 )
153
154 def fetchServiceAccountCredentials(self, credentials_file):
155 # The service account credentials file can be used for server-to-server applications
156 return service_account.Credentials.from_service_account_file(
157 credentials_file, scopes=GoogleClient.SCOPES)
158
159 def fetchInstalledOAuthCredentials(self, credentials_file):
160 creds = None
161
162 # The file token.pickle stores the user's access and refresh tokens, and is
163 # created automatically when the authorization flow completes for the first
164 # time.
165 if os.path.exists('token.pickle'):
166 with open('token.pickle', 'rb') as token:
167 creds = pickle.load(token)
168
169 # If there are no (valid) credentials available, let the user log in.
170 if not creds or not creds.valid:
171 if creds and creds.expired and creds.refresh_token:
172 creds.refresh(Request())
173 else:
174 flow = InstalledAppFlow.from_client_secrets_file(
175 credentials_file, GoogleClient.SCOPES)
176 creds = flow.run_local_server(port=0)
177 # Save the credentials for the next run
178 with open('token.pickle', 'wb') as token:
179 pickle.dump(creds, token)
180
181 return creds
144 182
145 def __enter__(self): 183 def __enter__(self):
146 self.get_access_token()
147 return self 184 return self
148 185
149 def __exit__(self, exception_type, exception_value, traceback): 186 def __exit__(self, exception_type, exception_value, traceback):
150 self.__session.close() 187 LOGGER.debug('exiting google client')
151
152
153 @backoff.on_exception(backoff.expo,
154 Server5xxError,
155 max_tries=5,
156 factor=2)
157 def get_access_token(self):
158 # The refresh_token never expires and may be used many times to generate each access_token
159 # Since the refresh_token does not expire, it is not included in get access_token response
160 if self.__access_token is not None and self.__expires > datetime.utcnow():
161 return
162
163 headers = {}
164 if self.__user_agent:
165 headers['User-Agent'] = self.__user_agent
166
167 response = self.__session.post(
168 url=GOOGLE_TOKEN_URI,
169 headers=headers,
170 data={
171 'grant_type': 'refresh_token',
172 'client_id': self.__client_id,
173 'client_secret': self.__client_secret,
174 'refresh_token': self.__refresh_token,
175 })
176
177 if response.status_code >= 500:
178 raise Server5xxError()
179
180 if response.status_code != 200:
181 raise_for_error(response)
182
183 data = response.json()
184 self.__access_token = data['access_token']
185 self.__expires = datetime.utcnow() + timedelta(seconds=data['expires_in'])
186 LOGGER.info('Authorized, token expires = {}'.format(self.__expires))
187
188 188
189 # Rate Limit: https://developers.google.com/sheets/api/limits 189 # Rate Limit: https://developers.google.com/sheets/api/limits
190 # 100 request per 100 seconds per User 190 # 100 request per 100 seconds per User
@@ -193,53 +193,48 @@ class GoogleClient: # pylint: disable=too-many-instance-attributes
193 max_tries=7, 193 max_tries=7,
194 factor=3) 194 factor=3)
195 @utils.ratelimit(100, 100) 195 @utils.ratelimit(100, 100)
196 def request(self, method, path=None, url=None, api=None, **kwargs): 196 def request(self, endpoint=None, params={}, **kwargs):
197 self.get_access_token() 197 formatted_params = {}
198 self.base_url = 'https://sheets.googleapis.com/v4' 198 for (key, value) in params.items():
199 if api == 'files': 199 # API parameters interpolation
200 self.base_url = 'https://www.googleapis.com/drive/v3' 200 # will raise a KeyError in case a necessary argument is missing
201 201 formatted_params[key] = value.format(**kwargs)
202 if not url and path: 202
203 url = '{}/{}'.format(self.base_url, path) 203 # Call the correct Google API depending on the stream name
204 204 if endpoint == 'spreadsheet_metadata' or endpoint == 'sheet_metadata':
205 # endpoint = stream_name (from sync.py API call) 205 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/get
206 if 'endpoint' in kwargs: 206 request = self.__sheets_service.spreadsheets().get(**formatted_params)
207 endpoint = kwargs['endpoint'] 207 elif endpoint == 'sheets_loaded':
208 del kwargs['endpoint'] 208 # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets.values/get
209 request = self.__sheets_service.spreadsheets().values().get(**formatted_params)
210 elif endpoint == 'file_metadata':
211 # https://developers.google.com/drive/api/v3/reference/files/get
212 request = self.__drive_service.files().get(**formatted_params)
209 else: 213 else:
210 endpoint = None 214 raise Exception('{} not implemented yet!'.format(endpoint))
211 LOGGER.info('{} URL = {}'.format(endpoint, url))
212
213 if 'headers' not in kwargs:
214 kwargs['headers'] = {}
215 kwargs['headers']['Authorization'] = 'Bearer {}'.format(self.__access_token)
216 215
217 if self.__user_agent: 216 with metrics.http_request_timer(endpoint) as timer:
218 kwargs['headers']['User-Agent'] = self.__user_agent 217 error = None
218 status_code = 400
219 219
220 if method == 'POST': 220 try:
221 kwargs['headers']['Content-Type'] = 'application/json' 221 response = request.execute()
222 status_code = 200
223 except HttpError as e:
224 status_code = e.resp.status or status_code
225 error = e
222 226
223 with metrics.http_request_timer(endpoint) as timer: 227 timer.tags[metrics.Tag.http_status_code] = status_code
224 response = self.__session.request(method, url, **kwargs)
225 timer.tags[metrics.Tag.http_status_code] = response.status_code
226 228
227 if response.status_code >= 500: 229 if status_code >= 500:
228 raise Server5xxError() 230 raise Server5xxError()
229 231
230 #Use retry functionality in backoff to wait and retry if 232 # Use retry functionality in backoff to wait and retry if
231 #response code equals 429 because rate limit has been exceeded 233 # response code equals 429 because rate limit has been exceeded
232 if response.status_code == 429: 234 if status_code == 429:
233 raise Server429Error() 235 raise Server429Error()
234 236
235 if response.status_code != 200: 237 if status_code != 200:
236 raise_for_error(response) 238 raise error
237
238 # Ensure keys and rows are ordered as received from API
239 return response.json(object_pairs_hook=OrderedDict)
240
241 def get(self, path, api, **kwargs):
242 return self.request(method='GET', path=path, api=api, **kwargs)
243 239
244 def post(self, path, api, **kwargs): 240 return response
245 return self.request(method='POST', path=path, api=api, **kwargs)
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py
index fcaccf9..56d2fb9 100644
--- a/tap_google_sheets/schema.py
+++ b/tap_google_sheets/schema.py
@@ -224,16 +224,13 @@ def get_sheet_metadata(sheet, spreadsheet_id, client):
224 224
225 stream_name = 'sheet_metadata' 225 stream_name = 'sheet_metadata'
226 stream_metadata = STREAMS.get(stream_name) 226 stream_metadata = STREAMS.get(stream_name)
227 api = stream_metadata.get('api', 'sheets')
228 params = stream_metadata.get('params', {}) 227 params = stream_metadata.get('params', {})
229 sheet_title_encoded = urllib.parse.quote_plus(sheet_title) 228
230 sheet_title_escaped = re.escape(sheet_title) 229 # GET sheet_metadata
231 querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \ 230 sheet_md_results = client.request(endpoint=stream_name,
232 params.items()]).replace('{sheet_title}', sheet_title_encoded) 231 spreadsheet_id=spreadsheet_id,
233 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ 232 sheet_title=sheet_title,
234 spreadsheet_id), querystring) 233 params=params)
235
236 sheet_md_results = client.get(path=path, api=api, endpoint=sheet_title_escaped)
237 # sheet_metadata: 1st `sheets` node in results 234 # sheet_metadata: 1st `sheets` node in results
238 sheet_metadata = sheet_md_results.get('sheets')[0] 235 sheet_metadata = sheet_md_results.get('sheets')[0]
239 236
@@ -275,15 +272,12 @@ def get_schemas(client, spreadsheet_id):
275 field_metadata[stream_name] = mdata 272 field_metadata[stream_name] = mdata
276 273
277 if stream_name == 'spreadsheet_metadata': 274 if stream_name == 'spreadsheet_metadata':
278 api = stream_metadata.get('api', 'sheets')
279 params = stream_metadata.get('params', {}) 275 params = stream_metadata.get('params', {})
280 querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
281 path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
282 spreadsheet_id), querystring)
283 276
284 # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet) 277 # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
285 spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \ 278 spreadsheet_md_results = client.request(endpoint=stream_name,
286 endpoint=stream_name) 279 spreadsheet_id=spreadsheet_id,
280 params=params)
287 281
288 sheets = spreadsheet_md_results.get('sheets') 282 sheets = spreadsheet_md_results.get('sheets')
289 if sheets: 283 if sheets:
diff --git a/tap_google_sheets/streams.py b/tap_google_sheets/streams.py
index ad5529f..f7bf8ac 100644
--- a/tap_google_sheets/streams.py
+++ b/tap_google_sheets/streams.py
@@ -2,9 +2,7 @@ from collections import OrderedDict
2 2
3# streams: API URL endpoints to be called 3# streams: API URL endpoints to be called
4# properties: 4# properties:
5# <root node>: Plural stream name for the endpoint 5# <root node>: Plural stream name which will condition the endpoint called
6# path: API endpoint relative path, when added to the base URL, creates the full path,
7# default = stream_name
8# key_properties: Primary key fields for identifying an endpoint record. 6# key_properties: Primary key fields for identifying an endpoint record.
9# replication_method: INCREMENTAL or FULL_TABLE 7# replication_method: INCREMENTAL or FULL_TABLE
10# replication_keys: bookmark_field(s), typically a date-time, used for filtering the results 8# replication_keys: bookmark_field(s), typically a date-time, used for filtering the results
@@ -15,51 +13,51 @@ from collections import OrderedDict
15 13
16# file_metadata: Queries Google Drive API to get file information and see if file has been modified 14# file_metadata: Queries Google Drive API to get file information and see if file has been modified
17# Provides audit info about who and when last changed the file. 15# Provides audit info about who and when last changed the file.
16# cf https://developers.google.com/drive/api/v3/reference/files/get
18FILE_METADATA = { 17FILE_METADATA = {
19 "api": "files",
20 "path": "files/{spreadsheet_id}",
21 "key_properties": ["id"], 18 "key_properties": ["id"],
22 "replication_method": "INCREMENTAL", 19 "replication_method": "INCREMENTAL",
23 "replication_keys": ["modifiedTime"], 20 "replication_keys": ["modifiedTime"],
24 "params": { 21 "params": {
22 "fileId": "{spreadsheet_id}",
25 "fields": "id,name,createdTime,modifiedTime,version,teamDriveId,driveId,lastModifyingUser" 23 "fields": "id,name,createdTime,modifiedTime,version,teamDriveId,driveId,lastModifyingUser"
26 } 24 }
27} 25}
28 26
29# spreadsheet_metadata: Queries spreadsheet to get basic information on spreadhsheet and sheets 27# spreadsheet_metadata: Queries spreadsheet to get basic information on spreadhsheet and sheets
28# cf https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/get
30SPREADSHEET_METADATA = { 29SPREADSHEET_METADATA = {
31 "api": "sheets",
32 "path": "spreadsheets/{spreadsheet_id}",
33 "key_properties": ["spreadsheetId"], 30 "key_properties": ["spreadsheetId"],
34 "replication_method": "FULL_TABLE", 31 "replication_method": "FULL_TABLE",
35 "params": { 32 "params": {
36 "includeGridData": "false" 33 "spreadsheetId": "{spreadsheet_id}"
37 } 34 }
38} 35}
39 36
40# sheet_metadata: Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet. 37# sheet_metadata: Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet.
41# This endpoint includes detailed metadata about each cell in the header and first data row 38# This endpoint includes detailed metadata about each cell in the header and first data row
42# incl. data type, formatting, etc. 39# incl. data type, formatting, etc.
40# cf https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/get
43SHEET_METADATA = { 41SHEET_METADATA = {
44 "api": "sheets",
45 "path": "spreadsheets/{spreadsheet_id}",
46 "key_properties": ["sheetId"], 42 "key_properties": ["sheetId"],
47 "replication_method": "FULL_TABLE", 43 "replication_method": "FULL_TABLE",
48 "params": { 44 "params": {
45 "spreadsheetId": "{spreadsheet_id}",
49 "includeGridData": "true", 46 "includeGridData": "true",
50 "ranges": "'{sheet_title}'!1:2" 47 "ranges": "'{sheet_title}'!1:2"
51 } 48 }
52} 49}
53 50
54# sheets_loaded: Queries a batch of Rows for each Sheet in the Spreadsheet. 51# sheets_loaded: Queries a batch of Rows for each Sheet in the Spreadsheet.
55# Each query uses the `values` endpoint, to get data-only, w/out the formatting/type metadata. 52# Each query uses the `values` endpoint, to get data-only, w/out the formatting/type metadata.
53# cf https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets.values/get
56SHEETS_LOADED = { 54SHEETS_LOADED = {
57 "api": "sheets",
58 "path": "spreadsheets/{spreadsheet_id}/values/'{sheet_title}'!{range_rows}",
59 "data_key": "values", 55 "data_key": "values",
60 "key_properties": ["spreadsheetId", "sheetId", "loadDate"], 56 "key_properties": ["spreadsheetId", "sheetId", "loadDate"],
61 "replication_method": "FULL_TABLE", 57 "replication_method": "FULL_TABLE",
62 "params": { 58 "params": {
59 "spreadsheetId": "{spreadsheet_id}",
60 "range": "'{sheet_title}'!{range_rows}",
63 "dateTimeRenderOption": "SERIAL_NUMBER", 61 "dateTimeRenderOption": "SERIAL_NUMBER",
64 "valueRenderOption": "UNFORMATTED_VALUE", 62 "valueRenderOption": "UNFORMATTED_VALUE",
65 "majorDimension": "ROWS" 63 "majorDimension": "ROWS"
diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py
index 26c2d19..c67055a 100644
--- a/tap_google_sheets/sync.py
+++ b/tap_google_sheets/sync.py
@@ -141,35 +141,17 @@ def get_selected_fields(catalog, stream_name):
141 pass 141 pass
142 return selected_fields 142 return selected_fields
143 143
144
145def get_data(stream_name, 144def get_data(stream_name,
146 endpoint_config, 145 endpoint_config,
147 client, 146 client,
148 spreadsheet_id, 147 **kwargs):
149 range_rows=None):
150 if not range_rows:
151 range_rows = ''
152 # Replace {placeholder} variables in path
153 # Encode stream_name: fixes issue w/ special characters in sheet name
154 stream_name_escaped = re.escape(stream_name)
155 stream_name_encoded = urllib.parse.quote_plus(stream_name)
156 path = endpoint_config.get('path', stream_name).replace(
157 '{spreadsheet_id}', spreadsheet_id).replace('{sheet_title}', stream_name_encoded).replace(
158 '{range_rows}', range_rows)
159 params = endpoint_config.get('params', {}) 148 params = endpoint_config.get('params', {})
160 api = endpoint_config.get('api', 'sheets') 149 LOGGER.info('GET {}'.format(stream_name))
161 # Add in querystring parameters and replace {placeholder} variables
162 # querystring function ensures parameters are added but not encoded causing API errors
163 querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]).replace(
164 '{sheet_title}', stream_name_encoded)
165 LOGGER.info('URL: {}/{}?{}'.format(client.base_url, path, querystring))
166 data = {}
167 time_extracted = utils.now() 150 time_extracted = utils.now()
168 data = client.get( 151 data = client.request(
169 path=path, 152 endpoint=stream_name,
170 api=api, 153 params=params,
171 params=querystring, 154 **kwargs)
172 endpoint=stream_name_escaped)
173 return data, time_extracted 155 return data, time_extracted
174 156
175 157
@@ -382,7 +364,7 @@ def sync(client, config, catalog, state):
382 file_metadata_config = STREAMS.get(stream_name) 364 file_metadata_config = STREAMS.get(stream_name)
383 365
384 # GET file_metadata 366 # GET file_metadata
385 LOGGER.info('GET file_meatadata') 367 LOGGER.info('GET file_metadata')
386 file_metadata, time_extracted = get_data(stream_name=stream_name, 368 file_metadata, time_extracted = get_data(stream_name=stream_name,
387 endpoint_config=file_metadata_config, 369 endpoint_config=file_metadata_config,
388 client=client, 370 client=client,
@@ -497,11 +479,12 @@ def sync(client, config, catalog, state):
497 while not is_last_row and from_row < sheet_max_row and to_row <= sheet_max_row: 479 while not is_last_row and from_row < sheet_max_row and to_row <= sheet_max_row:
498 range_rows = 'A{}:{}{}'.format(from_row, sheet_last_col_letter, to_row) 480 range_rows = 'A{}:{}{}'.format(from_row, sheet_last_col_letter, to_row)
499 481
500 # GET sheet_data for a worksheet tab 482 # GET sheets_loaded for a worksheet tab
501 sheet_data, time_extracted = get_data( 483 sheet_data, time_extracted = get_data(
502 stream_name=sheet_title, 484 stream_name='sheets_loaded',
503 endpoint_config=sheets_loaded_config, 485 endpoint_config=sheets_loaded_config,
504 client=client, 486 client=client,
487 sheet_title=sheet_title,
505 spreadsheet_id=spreadsheet_id, 488 spreadsheet_id=spreadsheet_id,
506 range_rows=range_rows) 489 range_rows=range_rows)
507 # Data is returned as a list of arrays, an array of values for each row 490 # Data is returned as a list of arrays, an array of values for each row