tap_google_sheets/schema.py

   1 import os
   2 import json
   3 from collections import OrderedDict
   4 import singer
   5 from singer import metadata
   6 from tap_google_sheets.streams import STREAMS
   7
   8 LOGGER = singer.get_logger()
   9
  10 # Reference:
  11 # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata
  12
  13 # Convert column index to column letter
  14 def colnum_string(n):
  15     string = ""
  16     while n > 0:
  17         n, remainder = divmod(n - 1, 26)
  18         string = chr(65 + remainder) + string
  19     return string
  20
  21
  22 # Create sheet_metadata_json with columns from sheet
  23 def get_sheet_schema_columns(sheet, spreadsheet_id, client):
  24     sheet_json_schema = OrderedDict()
  25     data = next(iter(sheet.get('data', [])), {})
  26     row_data = data.get('rowData',[])
  27     # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
  28
  29     headers = row_data[0].get('values', [])
  30     first_values = row_data[1].get('values', [])
  31     # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
  32
  33     sheet_json_schema['type'] = 'object'
  34     sheet_json_schema['additionalProperties'] = False
  35     sheet_json_schema = {
  36         'type': 'object',
  37         'additionalProperties': False,
  38         'properties': {
  39             '__sdc_spreadsheet_id': {
  40                 'type': ['null', 'string']
  41             },
  42             '__sdc_sheet_id': {
  43                 'type': ['null', 'integer']
  44             },
  45             '__sdc_row': {
  46                 'type': ['null', 'integer']
  47             }
  48         }
  49     }
  50
  51     header_list = [] # used for checking uniqueness
  52     columns = []
  53     prior_header = None
  54     i = 0
  55     skipped = 0
  56     # Read column headers until end or 2 consecutive skipped headers
  57     for header in headers:
  58         # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
  59         column_index = i + 1
  60         column_letter = colnum_string(column_index)
  61         header_value = header.get('formattedValue')
  62         if header_value: # NOT skipped
  63             column_is_skipped = False
  64             skipped = 0
  65             column_name = '{}'.format(header_value)
  66             if column_name in header_list:
  67                 raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name))
  68             else:
  69                 header_list.append(column_name)
  70
  71             first_value = first_values[i]
  72             # LOGGER.info('first_value[{}] = {}'.format(i, json.dumps(first_value, indent=2, sort_keys=True)))
  73
  74             column_effective_value = first_value.get('effectiveValue', {})
  75             for key in column_effective_value.keys():
  76                 if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'):
  77                     column_effective_value_type = key
  78
  79             column_number_format = first_values[i].get('effectiveFormat', {}).get('numberFormat', {})
  80             column_number_format_type = column_number_format.get('type')
  81
  82             # Determine datatype for sheet_json_schema
  83             #
  84             # column_effective_value_type = numberValue, stringValue, boolValue; INVALID: errorType, formulaType
  85             #   Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
  86             #
  87             # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE, TIME, DATE_TIME, SCIENTIFIC
  88             #   Reference: https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
  89             #
  90             column_format = None # Default
  91             # column_multiple_of = None # Default
  92             if column_effective_value_type in ('formulaValue', 'errorValue'):
  93                 raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(column_name))
  94             elif column_effective_value_type == 'stringValue':
  95                 column_type = ['null', 'string']
  96                 column_gs_type = 'stringValue'
  97             elif column_effective_value_type == 'boolValue':
  98                 column_type = ['null', 'boolean', 'string']
  99                 column_gs_type = 'boolValue'
 100             elif column_effective_value_type == 'numberValue':
 101                 if column_number_format_type == 'DATE_TIME':
 102                     column_type = ['null', 'string']
 103                     column_format = 'date-time'
 104                     column_gs_type = 'numberType.DATE_TIME'
 105                 elif column_number_format_type == 'DATE':
 106                     column_type = ['null', 'string']
 107                     column_format = 'date'
 108                     column_gs_type = 'numberType.DATE'
 109                 elif column_number_format_type == 'TIME':
 110                     column_type = ['null', 'string']
 111                     column_format = 'time'
 112                     column_gs_type = 'numberType.TIME'
 113                 elif column_number_format_type == 'TEXT':
 114                     column_type = ['null', 'string']
 115                     column_gs_type = 'stringValue'
 116                 else:
 117                     column_type = ['null', 'number', 'string']
 118                     column_gs_type = 'numberType'
 119
 120         else: # skipped
 121             column_is_skipped = True
 122             skipped = skipped + 1
 123             column_index_str = str(column_index).zfill(2)
 124             column_name = '__sdc_skip_col_{}'.format(column_index_str)
 125             column_type = ['null', 'string']
 126             column_format = None
 127             column_gs_type = 'stringValue'
 128
 129         if skipped >= 2:
 130             # skipped = 2 consecutive skipped headers
 131             # Remove prior_header column_name
 132             sheet_json_schema['properties'].pop(prior_header, None)
 133             column_count = i - 1
 134             break
 135
 136         else:
 137             column = {}
 138             column = {
 139                 'columnIndex': column_index,
 140                 'columnLetter': column_letter,
 141                 'columnName': column_name,
 142                 'columnType': column_gs_type,
 143                 'columnSkipped': column_is_skipped
 144             }
 145             columns.append(column)
 146
 147             sheet_json_schema['properties'][column_name] = column
 148             sheet_json_schema['properties'][column_name]['type'] = column_type
 149             if column_format:
 150                 sheet_json_schema['properties'][column_name]['format'] = column_format
 151
 152         prior_header = column_name
 153         i = i + 1
 154
 155     return sheet_json_schema, columns
 156
 157
 158 def get_sheet_metadata(sheet, spreadsheet_id, client):
 159     sheet_id = sheet.get('properties', {}).get('sheetId')
 160     sheet_title = sheet.get('properties', {}).get('title')
 161     LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))
 162
 163     stream_name = 'sheet_metadata'
 164     stream_metadata = STREAMS.get(stream_name)
 165     api = stream_metadata.get('api', 'sheets')
 166     params = stream_metadata.get('params', {})
 167     querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]).replace('{sheet_title}', sheet_title)
 168     path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id), querystring)
 169
 170     sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
 171     sheet_cols = sheet_md_results.get('sheets')[0]
 172     sheet_schema, columns = get_sheet_schema_columns(sheet_cols, spreadsheet_id, client)
 173
 174     return sheet_schema, columns
 175
 176
 177 def get_abs_path(path):
 178     return os.path.join(os.path.dirname(os.path.realpath(__file__)), path)
 179
 180 def get_schemas(client, spreadsheet_id):
 181     schemas = {}
 182     field_metadata = {}
 183
 184     for stream_name, stream_metadata in STREAMS.items():
 185         schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
 186         with open(schema_path) as file:
 187             schema = json.load(file)
 188         schemas[stream_name] = schema
 189         mdata = metadata.new()
 190
 191         # Documentation:
 192         # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
 193         # Reference:
 194         # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
 195         mdata = metadata.get_standard_metadata(
 196             schema=schema,
 197             key_properties=stream_metadata.get('key_properties', None),
 198             valid_replication_keys=stream_metadata.get('replication_keys', None),
 199             replication_method=stream_metadata.get('replication_method', None)
 200         )
 201         field_metadata[stream_name] = mdata
 202
 203         if stream_name == 'spreadsheet_metadata':
 204             api = stream_metadata.get('api', 'sheets')
 205             params = stream_metadata.get('params', {})
 206             querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
 207             path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id), querystring)
 208
 209             spreadsheet_md_results = client.get(path=path, params=querystring, api=api, endpoint=stream_name)
 210
 211             sheets = spreadsheet_md_results.get('sheets')
 212             if sheets:
 213                 for sheet in sheets:
 214                     sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
 215                     # LOGGER.info('sheet_schema = {}'.format(json.dumps(sheet_schema, indent=2, sort_keys=True)))
 216
 217                     sheet_title = sheet.get('properties', {}).get('title')
 218                     schemas[sheet_title] = sheet_schema
 219                     sheet_mdata = metadata.new()
 220                     sheet_mdata = metadata.get_standard_metadata(
 221                         schema=sheet_schema,
 222                         key_properties=['__sdc_row'],
 223                         valid_replication_keys=None,
 224                         replication_method='FULL_TABLE'
 225                     )
 226                     field_metadata[sheet_title] = sheet_mdata
 227
 228     return schemas, field_metadata