tap_google_sheets/streams.py

   1 from collections import OrderedDict
   2
   3 # streams: API URL endpoints to be called
   4 # properties:
   5 #   <root node>: Plural stream name for the endpoint
   6 #   path: API endpoint relative path, when added to the base URL, creates the full path,
   7 #       default = stream_name
   8 #   key_properties: Primary key fields for identifying an endpoint record.
   9 #   replication_method: INCREMENTAL or FULL_TABLE
  10 #   replication_keys: bookmark_field(s), typically a date-time, used for filtering the results
  11 #       and setting the state
  12 #   params: Query, sort, and other endpoint specific parameters; default = {}
  13 #   data_key: JSON element containing the results list for the endpoint;
  14 #       default = root (no data_key)
  15
  16 # file_metadata: Queries Google Drive API to get file information and see if file has been modified
  17 #    Provides audit info about who and when last changed the file.
  18 FILE_METADATA = {
  19     "api": "files",
  20     "path": "files/{spreadsheet_id}",
  21     "key_properties": ["id"],
  22     "replication_method": "INCREMENTAL",
  23     "replication_keys": ["modifiedTime"],
  24     "params": {
  25         "fields": "id,name,createdTime,modifiedTime,version,teamDriveId,driveId,lastModifyingUser"
  26     }
  27 }
  28
  29 # spreadsheet_metadata: Queries spreadsheet to get basic information on spreadhsheet and sheets
  30 SPREADSHEET_METADATA = {
  31     "api": "sheets",
  32     "path": "spreadsheets/{spreadsheet_id}",
  33     "key_properties": ["spreadsheetId"],
  34     "replication_method": "FULL_TABLE",
  35     "params": {
  36         "includeGridData": "false"
  37     }
  38 }
  39
  40 # sheet_metadata: Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet.
  41 # This endpoint includes detailed metadata about each cell in the header and first data row
  42 #   incl. data type, formatting, etc.
  43 SHEET_METADATA = {
  44     "api": "sheets",
  45     "path": "spreadsheets/{spreadsheet_id}",
  46     "key_properties": ["sheetId"],
  47     "replication_method": "FULL_TABLE",
  48     "params": {
  49         "includeGridData": "true",
  50         "ranges": "'{sheet_title}'!1:2"
  51     }
  52 }
  53
  54 # sheets_loaded: Queries a batch of Rows for each Sheet in the Spreadsheet.
  55 # Each query uses the `values` endpoint, to get data-only, w/out the formatting/type metadata.
  56 SHEETS_LOADED = {
  57     "api": "sheets",
  58     "path": "spreadsheets/{spreadsheet_id}/values/'{sheet_title}'!{range_rows}",
  59     "data_key": "values",
  60     "key_properties": ["spreadsheetId", "sheetId", "loadDate"],
  61     "replication_method": "FULL_TABLE",
  62     "params": {
  63         "dateTimeRenderOption": "SERIAL_NUMBER",
  64         "valueRenderOption": "UNFORMATTED_VALUE",
  65         "majorDimension": "ROWS"
  66     }
  67 }
  68
  69 # Ensure streams are ordered sequentially, logically.
  70 STREAMS = OrderedDict()
  71 STREAMS['file_metadata'] = FILE_METADATA
  72 STREAMS['spreadsheet_metadata'] = SPREADSHEET_METADATA
  73 STREAMS['sheet_metadata'] = SHEET_METADATA
  74 STREAMS['sheets_loaded'] = SHEETS_LOADED