tap_google_sheets/schema.py

   1 import os
   2 import json
   3 from collections import OrderedDict
   4 import singer
   5 from singer import metadata
   6 from tap_google_sheets.streams import STREAMS
   7
   8 LOGGER = singer.get_logger()
   9
  10 # Reference:
  11 # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata
  12
  13 # Convert column index to column letter
  14 def colnum_string(num):
  15     string = ""
  16     while num > 0:
  17         num, remainder = divmod(num - 1, 26)
  18         string = chr(65 + remainder) + string
  19     return string
  20
  21
  22 # Create sheet_metadata_json with columns from sheet
  23 def get_sheet_schema_columns(sheet):
  24     sheet_title = sheet.get('properties', {}).get('title')
  25     sheet_json_schema = OrderedDict()
  26     data = next(iter(sheet.get('data', [])), {})
  27     row_data = data.get('rowData', [])
  28     if row_data == []:
  29         # Empty sheet, SKIP
  30         LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
  31         return None, None
  32     else:
  33         # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
  34         headers = row_data[0].get('values', [])
  35         first_values = row_data[1].get('values', [])
  36         # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
  37
  38         sheet_json_schema = {
  39             'type': 'object',
  40             'additionalProperties': False,
  41             'properties': {
  42                 '__sdc_spreadsheet_id': {
  43                     'type': ['null', 'string']
  44                 },
  45                 '__sdc_sheet_id': {
  46                     'type': ['null', 'integer']
  47                 },
  48                 '__sdc_row': {
  49                     'type': ['null', 'integer']
  50                 }
  51             }
  52         }
  53
  54         header_list = [] # used for checking uniqueness
  55         columns = []
  56         prior_header = None
  57         i = 0
  58         skipped = 0
  59         # Read column headers until end or 2 consecutive skipped headers
  60         for header in headers:
  61             # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
  62             column_index = i + 1
  63             column_letter = colnum_string(column_index)
  64             header_value = header.get('formattedValue')
  65             if header_value: # NOT skipped
  66                 column_is_skipped = False
  67                 skipped = 0
  68                 column_name = '{}'.format(header_value)
  69                 if column_name in header_list:
  70                     raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
  71                         sheet_title, column_name, column_letter))
  72                 header_list.append(column_name)
  73
  74                 first_value = None
  75                 try:
  76                     first_value = first_values[i]
  77                 except IndexError as err:
  78                     raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
  79                         sheet_title, column_name, column_letter, err))
  80
  81                 column_effective_value = first_value.get('effectiveValue', {})
  82
  83                 col_val = None
  84                 if column_effective_value == {}:
  85                     column_effective_value_type = 'stringValue'
  86                     LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
  87                         sheet_title, column_name, column_letter))
  88                     LOGGER.info('   Setting column datatype to STRING')
  89                 else:
  90                     for key, val in column_effective_value.items():
  91                         if key in ('numberValue', 'stringValue', 'boolValue'):
  92                             column_effective_value_type = key
  93                             col_val = str(val)
  94                         elif key in ('errorType', 'formulaType'):
  95                             col_val = str(val)
  96                             raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
  97                                 sheet_title, column_name, column_letter, key, col_val))
  98
  99                 column_number_format = first_values[i].get('effectiveFormat', {}).get(
 100                     'numberFormat', {})
 101                 column_number_format_type = column_number_format.get('type')
 102
 103                 # Determine datatype for sheet_json_schema
 104                 #
 105                 # column_effective_value_type = numberValue, stringValue, boolValue;
 106                 #  INVALID: errorType, formulaType
 107                 #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
 108                 #
 109                 # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
 110                 #   TIME, DATE_TIME, SCIENTIFIC
 111                 #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
 112                 #
 113                 column_format = None # Default
 114                 if column_effective_value == {}:
 115                     col_properties = {'type': ['null', 'string']}
 116                     column_gs_type = 'stringValue'
 117                     LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
 118                             sheet_title, column_name, column_letter))
 119                     LOGGER.info('   Setting column datatype to STRING')
 120                 elif column_effective_value_type == 'stringValue':
 121                     col_properties = {'type': ['null', 'string']}
 122                     column_gs_type = 'stringValue'
 123                 elif column_effective_value_type == 'boolValue':
 124                     col_properties = {'type': ['null', 'boolean', 'string']}
 125                     column_gs_type = 'boolValue'
 126                 elif column_effective_value_type == 'numberValue':
 127                     if column_number_format_type == 'DATE_TIME':
 128                         col_properties = {
 129                             'type': ['null', 'string'],
 130                             'format': 'date-time'
 131                         }
 132                         column_gs_type = 'numberType.DATE_TIME'
 133                     elif column_number_format_type == 'DATE':
 134                         col_properties = {
 135                             'type': ['null', 'string'],
 136                             'format': 'date'
 137                         }
 138                         column_gs_type = 'numberType.DATE'
 139                     elif column_number_format_type == 'TIME':
 140                         col_properties = {
 141                             'type': ['null', 'string'],
 142                             'format': 'time'
 143                         }
 144                         column_gs_type = 'numberType.TIME'
 145                     elif column_number_format_type == 'TEXT':
 146                         col_properties = {'type': ['null', 'string']}
 147                         column_gs_type = 'stringValue'
 148                     else:
 149                         # Interesting - order in the anyOf makes a difference.
 150                         # Number w/ multipleOf must be listed last, otherwise errors occur.
 151                         col_properties =  {
 152                             'anyOf': [
 153                                 {
 154                                     'type': 'null'
 155                                 },
 156                                 {
 157                                     'type': 'number',
 158                                     'multipleOf': 1e-15
 159                                 },
 160                                 {
 161                                     'type': 'string'
 162                                 }
 163                             ]
 164                         }
 165                         column_gs_type = 'numberType'
 166                 # Catch-all to deal with other types and set to string
 167                 # column_effective_value_type: formulaValue, errorValue, or other
 168                 else:
 169                     col_properties = {'type': ['null', 'string']}
 170                     column_gs_type = 'unsupportedValue'
 171                     LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
 172                             sheet_title, column_name, column_letter, column_effective_value_type, col_val))
 173                     LOGGER.info('Converting to string.')
 174             else: # skipped
 175                 column_is_skipped = True
 176                 skipped = skipped + 1
 177                 column_index_str = str(column_index).zfill(2)
 178                 column_name = '__sdc_skip_col_{}'.format(column_index_str)
 179                 col_properties = {'type': ['null', 'string']}
 180                 column_gs_type = 'stringValue'
 181                 LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
 182                     sheet_title, column_name, column_letter))
 183                 LOGGER.info('  This column will be skipped during data loading.')
 184
 185             if skipped >= 2:
 186                 # skipped = 2 consecutive skipped headers
 187                 # Remove prior_header column_name
 188                 sheet_json_schema['properties'].pop(prior_header, None)
 189                 LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
 190                     sheet_title, column_name, column_letter))
 191                 break
 192
 193             else:
 194                 column = {}
 195                 column = {
 196                     'columnIndex': column_index,
 197                     'columnLetter': column_letter,
 198                     'columnName': column_name,
 199                     'columnType': column_gs_type,
 200                     'columnSkipped': column_is_skipped
 201                 }
 202                 columns.append(column)
 203
 204                 sheet_json_schema['properties'][column_name] = col_properties
 205
 206             prior_header = column_name
 207             i = i + 1
 208
 209         return sheet_json_schema, columns
 210
 211
 212 # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
 213 #   endpoint: spreadsheets/{spreadsheet_id}
 214 #   params: includeGridData = true, ranges = '{sheet_title}'!1:2
 215 # This endpoint includes detailed metadata about each cell - incl. data type, formatting, etc.
 216 def get_sheet_metadata(sheet, spreadsheet_id, client):
 217     sheet_id = sheet.get('properties', {}).get('sheetId')
 218     sheet_title = sheet.get('properties', {}).get('title')
 219     LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))
 220
 221     stream_name = 'sheet_metadata'
 222     stream_metadata = STREAMS.get(stream_name)
 223     api = stream_metadata.get('api', 'sheets')
 224     params = stream_metadata.get('params', {})
 225     querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \
 226         params.items()]).replace('{sheet_title}', sheet_title)
 227     path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
 228         spreadsheet_id), querystring)
 229
 230     sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
 231     # sheet_metadata: 1st `sheets` node in results
 232     sheet_metadata = sheet_md_results.get('sheets')[0]
 233
 234     # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
 235     sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
 236
 237     return sheet_json_schema, columns
 238
 239
 240 def get_abs_path(path):
 241     return os.path.join(os.path.dirname(os.path.realpath(__file__)), path)
 242
 243 def get_schemas(client, spreadsheet_id):
 244     schemas = {}
 245     field_metadata = {}
 246
 247     for stream_name, stream_metadata in STREAMS.items():
 248         schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
 249         with open(schema_path) as file:
 250             schema = json.load(file)
 251         schemas[stream_name] = schema
 252         mdata = metadata.new()
 253
 254         # Documentation:
 255         # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
 256         # Reference:
 257         # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
 258         mdata = metadata.get_standard_metadata(
 259             schema=schema,
 260             key_properties=stream_metadata.get('key_properties', None),
 261             valid_replication_keys=stream_metadata.get('replication_keys', None),
 262             replication_method=stream_metadata.get('replication_method', None)
 263         )
 264         field_metadata[stream_name] = mdata
 265
 266         if stream_name == 'spreadsheet_metadata':
 267             api = stream_metadata.get('api', 'sheets')
 268             params = stream_metadata.get('params', {})
 269             querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
 270             path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
 271                 spreadsheet_id), querystring)
 272
 273             # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
 274             spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
 275                 endpoint=stream_name)
 276
 277             sheets = spreadsheet_md_results.get('sheets')
 278             if sheets:
 279                 # Loop thru each worksheet in spreadsheet
 280                 for sheet in sheets:
 281                     # GET sheet_json_schema for each worksheet (from function above)
 282                     sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
 283
 284                     # SKIP empty sheets (where sheet_json_schema and columns are None)
 285                     if sheet_json_schema and columns:
 286                         sheet_title = sheet.get('properties', {}).get('title')
 287                         schemas[sheet_title] = sheet_json_schema
 288                         sheet_mdata = metadata.new()
 289                         sheet_mdata = metadata.get_standard_metadata(
 290                             schema=sheet_json_schema,
 291                             key_properties=['__sdc_row'],
 292                             valid_replication_keys=None,
 293                             replication_method='FULL_TABLE'
 294                         )
 295                         field_metadata[sheet_title] = sheet_mdata
 296
 297     return schemas, field_metadata