tap_google_sheets/schema.py

   1 import os
   2 import json
   3 from collections import OrderedDict
   4 import singer
   5 from singer import metadata
   6 from tap_google_sheets.streams import STREAMS
   7
   8 LOGGER = singer.get_logger()
   9
  10 # Reference:
  11 # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata
  12
  13 # Convert column index to column letter
  14 def colnum_string(num):
  15     string = ""
  16     while num > 0:
  17         num, remainder = divmod(num - 1, 26)
  18         string = chr(65 + remainder) + string
  19     return string
  20
  21
  22 # Create sheet_metadata_json with columns from sheet
  23 def get_sheet_schema_columns(sheet):
  24     sheet_json_schema = OrderedDict()
  25     data = next(iter(sheet.get('data', [])), {})
  26     row_data = data.get('rowData', [])
  27     # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
  28
  29     headers = row_data[0].get('values', [])
  30     first_values = row_data[1].get('values', [])
  31     # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
  32
  33     sheet_json_schema = {
  34         'type': 'object',
  35         'additionalProperties': False,
  36         'properties': {
  37             '__sdc_spreadsheet_id': {
  38                 'type': ['null', 'string']
  39             },
  40             '__sdc_sheet_id': {
  41                 'type': ['null', 'integer']
  42             },
  43             '__sdc_row': {
  44                 'type': ['null', 'integer']
  45             }
  46         }
  47     }
  48
  49     header_list = [] # used for checking uniqueness
  50     columns = []
  51     prior_header = None
  52     i = 0
  53     skipped = 0
  54     # Read column headers until end or 2 consecutive skipped headers
  55     for header in headers:
  56         # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
  57         column_index = i + 1
  58         column_letter = colnum_string(column_index)
  59         header_value = header.get('formattedValue')
  60         if header_value: # NOT skipped
  61             column_is_skipped = False
  62             skipped = 0
  63             column_name = '{}'.format(header_value)
  64             if column_name in header_list:
  65                 raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name))
  66             header_list.append(column_name)
  67
  68             first_value = first_values[i]
  69
  70             column_effective_value = first_value.get('effectiveValue', {})
  71             for key in column_effective_value.keys():
  72                 if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'):
  73                     column_effective_value_type = key
  74
  75             column_number_format = first_values[i].get('effectiveFormat', {}).get(
  76                 'numberFormat', {})
  77             column_number_format_type = column_number_format.get('type')
  78
  79             # Determine datatype for sheet_json_schema
  80             #
  81             # column_effective_value_type = numberValue, stringValue, boolValue;
  82             #  INVALID: errorType, formulaType
  83             #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
  84             #
  85             # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
  86             #   TIME, DATE_TIME, SCIENTIFIC
  87             #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
  88             #
  89             column_format = None # Default
  90             if column_effective_value_type == 'stringValue':
  91                 col_properties = {'type': ['null', 'string']}
  92                 column_gs_type = 'stringValue'
  93             elif column_effective_value_type == 'boolValue':
  94                 col_properties = {'type': ['null', 'boolean', 'string']}
  95                 column_gs_type = 'boolValue'
  96             elif column_effective_value_type == 'numberValue':
  97                 if column_number_format_type == 'DATE_TIME':
  98                     col_properties = {
  99                         'type': ['null', 'string'],
 100                         'format': 'date-time'
 101                     }
 102                     column_gs_type = 'numberType.DATE_TIME'
 103                 elif column_number_format_type == 'DATE':
 104                     col_properties = {
 105                         'type': ['null', 'string'],
 106                         'format': 'date'
 107                     }
 108                     column_gs_type = 'numberType.DATE'
 109                 elif column_number_format_type == 'TIME':
 110                     col_properties = {
 111                         'type': ['null', 'string'],
 112                         'format': 'time'
 113                     }
 114                     column_gs_type = 'numberType.TIME'
 115                 elif column_number_format_type == 'TEXT':
 116                     col_properties = {'type': ['null', 'string']}
 117                     column_gs_type = 'stringValue'
 118                 else:
 119                     # Interesting - order in the anyOf makes a difference.
 120                     # Number w/ multipleOf must be listed last, otherwise errors occur.
 121                     col_properties =  {
 122                         'anyOf': [
 123                             {
 124                                 'type': 'string'
 125                             },
 126                             {
 127                                 'type': 'null'
 128                             },
 129                             {
 130                                 'type': 'number',
 131                                 'multipleOf': 1e-15
 132                             }
 133                         ]
 134                     }
 135                     column_gs_type = 'numberType'
 136             # Catch-all to deal with other types and set to string
 137             # column_effective_value_type: formulaValue, errorValue, or other
 138             else:
 139                 col_properties = {'type': ['null', 'string']}
 140                 column_gs_type = 'unsupportedValue'
 141                 LOGGER.info('Unsupported data type: {}, value: {}'.format(column_name, \
 142                     column_effective_value_type))
 143                 LOGGER.info('Converting to string.')
 144         else: # skipped
 145             column_is_skipped = True
 146             skipped = skipped + 1
 147             column_index_str = str(column_index).zfill(2)
 148             column_name = '__sdc_skip_col_{}'.format(column_index_str)
 149             col_properties = {'type': ['null', 'string']}
 150             column_gs_type = 'stringValue'
 151
 152         if skipped >= 2:
 153             # skipped = 2 consecutive skipped headers
 154             # Remove prior_header column_name
 155             sheet_json_schema['properties'].pop(prior_header, None)
 156             break
 157
 158         else:
 159             column = {}
 160             column = {
 161                 'columnIndex': column_index,
 162                 'columnLetter': column_letter,
 163                 'columnName': column_name,
 164                 'columnType': column_gs_type,
 165                 'columnSkipped': column_is_skipped
 166             }
 167             columns.append(column)
 168
 169             sheet_json_schema['properties'][column_name] = col_properties
 170
 171         prior_header = column_name
 172         i = i + 1
 173
 174     return sheet_json_schema, columns
 175
 176
 177 # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
 178 #   endpoint: spreadsheets/{spreadsheet_id}
 179 #   params: includeGridData = true, ranges = '{sheet_title}'!1:2
 180 # This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc.
 181 def get_sheet_metadata(sheet, spreadsheet_id, client):
 182     sheet_id = sheet.get('properties', {}).get('sheetId')
 183     sheet_title = sheet.get('properties', {}).get('title')
 184     LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))
 185
 186     stream_name = 'sheet_metadata'
 187     stream_metadata = STREAMS.get(stream_name)
 188     api = stream_metadata.get('api', 'sheets')
 189     params = stream_metadata.get('params', {})
 190     querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \
 191         params.items()]).replace('{sheet_title}', sheet_title)
 192     path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
 193         spreadsheet_id), querystring)
 194
 195     sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
 196     # sheet_metadata: 1st `sheets` node in results
 197     sheet_metadata = sheet_md_results.get('sheets')[0]
 198
 199     # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
 200     sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
 201
 202     return sheet_json_schema, columns
 203
 204
 205 def get_abs_path(path):
 206     return os.path.join(os.path.dirname(os.path.realpath(__file__)), path)
 207
 208 def get_schemas(client, spreadsheet_id):
 209     schemas = {}
 210     field_metadata = {}
 211
 212     for stream_name, stream_metadata in STREAMS.items():
 213         schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
 214         with open(schema_path) as file:
 215             schema = json.load(file)
 216         schemas[stream_name] = schema
 217         mdata = metadata.new()
 218
 219         # Documentation:
 220         # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
 221         # Reference:
 222         # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
 223         mdata = metadata.get_standard_metadata(
 224             schema=schema,
 225             key_properties=stream_metadata.get('key_properties', None),
 226             valid_replication_keys=stream_metadata.get('replication_keys', None),
 227             replication_method=stream_metadata.get('replication_method', None)
 228         )
 229         field_metadata[stream_name] = mdata
 230
 231         if stream_name == 'spreadsheet_metadata':
 232             api = stream_metadata.get('api', 'sheets')
 233             params = stream_metadata.get('params', {})
 234             querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
 235             path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
 236                 spreadsheet_id), querystring)
 237
 238             # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
 239             spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
 240                 endpoint=stream_name)
 241
 242             sheets = spreadsheet_md_results.get('sheets')
 243             if sheets:
 244                 # Loop thru each worksheet in spreadsheet
 245                 for sheet in sheets:
 246                     # GET sheet_json_schema for each worksheet (from function above)
 247                     sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
 248                     LOGGER.info('columns = {}'.format(columns))
 249
 250                     sheet_title = sheet.get('properties', {}).get('title')
 251                     schemas[sheet_title] = sheet_json_schema
 252                     sheet_mdata = metadata.new()
 253                     sheet_mdata = metadata.get_standard_metadata(
 254                         schema=sheet_json_schema,
 255                         key_properties=['__sdc_row'],
 256                         valid_replication_keys=None,
 257                         replication_method='FULL_TABLE'
 258                     )
 259                     field_metadata[sheet_title] = sheet_mdata
 260
 261     return schemas, field_metadata