[github/fretlink/tap-google-sheets.git] / tap_google_sheets / schema.py

import os
import json
from collections import OrderedDict
import singer
from singer import metadata
from tap_google_sheets.streams import STREAMS

LOGGER = singer.get_logger()

# Reference:
# https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata

# Convert column index to column letter
def colnum_string(num):
    string = ""
    while num > 0:
        num, remainder = divmod(num - 1, 26)
        string = chr(65 + remainder) + string
    return string


# Create sheet_metadata_json with columns from sheet
def get_sheet_schema_columns(sheet):
    sheet_json_schema = OrderedDict()
    data = next(iter(sheet.get('data', [])), {})
    row_data = data.get('rowData', [])
    # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse

    headers = row_data[0].get('values', [])
    first_values = row_data[1].get('values', [])
    # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))

    sheet_json_schema['type'] = 'object'
    sheet_json_schema['additionalProperties'] = False
    sheet_json_schema = {
        'type': 'object',
        'additionalProperties': False,
        'properties': {
            '__sdc_spreadsheet_id': {
                'type': ['null', 'string']
            },
            '__sdc_sheet_id': {
                'type': ['null', 'integer']
            },
            '__sdc_row': {
                'type': ['null', 'integer']
            }
        }
    }

    header_list = [] # used for checking uniqueness
    columns = []
    prior_header = None
    i = 0
    skipped = 0
    # Read column headers until end or 2 consecutive skipped headers
    for header in headers:
        # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
        column_index = i + 1
        column_letter = colnum_string(column_index)
        header_value = header.get('formattedValue')
        if header_value: # NOT skipped
            column_is_skipped = False
            skipped = 0
            column_name = '{}'.format(header_value)
            if column_name in header_list:
                raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name))
            header_list.append(column_name)

            first_value = first_values[i]

            column_effective_value = first_value.get('effectiveValue', {})
            for key in column_effective_value.keys():
                if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'):
                    column_effective_value_type = key

            column_number_format = first_values[i].get('effectiveFormat', {}).get(
                'numberFormat', {})
            column_number_format_type = column_number_format.get('type')

            # Determine datatype for sheet_json_schema
            #
            # column_effective_value_type = numberValue, stringValue, boolValue;
            #  INVALID: errorType, formulaType
            #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
            #
            # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
            #   TIME, DATE_TIME, SCIENTIFIC
            #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
            #
            column_format = None # Default
            # column_multiple_of = None # Default
            if column_effective_value_type == 'stringValue':
                column_type = ['null', 'string']
                column_gs_type = 'stringValue'
            elif column_effective_value_type == 'boolValue':
                column_type = ['null', 'boolean', 'string']
                column_gs_type = 'boolValue'
            elif column_effective_value_type == 'numberValue':
                if column_number_format_type == 'DATE_TIME':
                    column_type = ['null', 'string']
                    column_format = 'date-time'
                    column_gs_type = 'numberType.DATE_TIME'
                elif column_number_format_type == 'DATE':
                    column_type = ['null', 'string']
                    column_format = 'date'
                    column_gs_type = 'numberType.DATE'
                elif column_number_format_type == 'TIME':
                    column_type = ['null', 'string']
                    column_format = 'time'
                    column_gs_type = 'numberType.TIME'
                elif column_number_format_type == 'TEXT':
                    column_type = ['null', 'string']
                    column_gs_type = 'stringValue'
                else:
                    column_type = ['null', 'number', 'string']
                    column_gs_type = 'numberType'
            elif column_effective_value_type in ('formulaValue', 'errorValue'):
                raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(column_name, \
                    column_effective_value_type))
        else: # skipped
            column_is_skipped = True
            skipped = skipped + 1
            column_index_str = str(column_index).zfill(2)
            column_name = '__sdc_skip_col_{}'.format(column_index_str)
            column_type = ['null', 'string']
            column_format = None
            column_gs_type = 'stringValue'

        if skipped >= 2:
            # skipped = 2 consecutive skipped headers
            # Remove prior_header column_name
            sheet_json_schema['properties'].pop(prior_header, None)
            break

        else:
            column = {}
            column = {
                'columnIndex': column_index,
                'columnLetter': column_letter,
                'columnName': column_name,
                'columnType': column_gs_type,
                'columnSkipped': column_is_skipped
            }
            columns.append(column)

            sheet_json_schema['properties'][column_name] = column
            sheet_json_schema['properties'][column_name]['type'] = column_type
            if column_format:
                sheet_json_schema['properties'][column_name]['format'] = column_format

        prior_header = column_name
        i = i + 1

    return sheet_json_schema, columns


def get_sheet_metadata(sheet, spreadsheet_id, client):
    sheet_id = sheet.get('properties', {}).get('sheetId')
    sheet_title = sheet.get('properties', {}).get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    stream_name = 'sheet_metadata'
    stream_metadata = STREAMS.get(stream_name)
    api = stream_metadata.get('api', 'sheets')
    params = stream_metadata.get('params', {})
    querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \
        params.items()]).replace('{sheet_title}', sheet_title)
    path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
        spreadsheet_id), querystring)

    sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
    sheet_cols = sheet_md_results.get('sheets')[0]
    sheet_schema, columns = get_sheet_schema_columns(sheet_cols)

    return sheet_schema, columns


def get_abs_path(path):
    return os.path.join(os.path.dirname(os.path.realpath(__file__)), path)

def get_schemas(client, spreadsheet_id):
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None)
        )
        field_metadata[stream_name] = mdata

        if stream_name == 'spreadsheet_metadata':
            api = stream_metadata.get('api', 'sheets')
            params = stream_metadata.get('params', {})
            querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
            path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
                spreadsheet_id), querystring)

            spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
                endpoint=stream_name)

            sheets = spreadsheet_md_results.get('sheets')
            if sheets:
                for sheet in sheets:
                    sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
                    LOGGER.info('columns = {}'.format(columns))

                    sheet_title = sheet.get('properties', {}).get('title')
                    schemas[sheet_title] = sheet_schema
                    sheet_mdata = metadata.new()
                    sheet_mdata = metadata.get_standard_metadata(
                        schema=sheet_schema,
                        key_properties=['__sdc_row'],
                        valid_replication_keys=None,
                        replication_method='FULL_TABLE'
                    )
                    field_metadata[sheet_title] = sheet_mdata

    return schemas, field_metadata
Commit	Line	Data
89643ba6 JH	1	import os
	2	import json
	3	from collections import OrderedDict
	4	import singer
	5	from singer import metadata
	6	from tap_google_sheets.streams import STREAMS
	7
	8	LOGGER = singer.get_logger()
	9
	10	# Reference:
	11	# https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#Metadata
	12
	13	# Convert column index to column letter
99424fee	14	def colnum_string(num):
89643ba6	15	string = ""
99424fee JH	16	while num > 0:
99424fee JH	17	num, remainder = divmod(num - 1, 26)
89643ba6 JH	18	string = chr(65 + remainder) + string
	19	return string
	20
	21
	22	# Create sheet_metadata_json with columns from sheet
99424fee	23	def get_sheet_schema_columns(sheet):
89643ba6 JH	24	sheet_json_schema = OrderedDict()
89643ba6 JH	25	data = next(iter(sheet.get('data', [])), {})
99424fee	26	row_data = data.get('rowData', [])
89643ba6 JH	27	# spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
	28
	29	headers = row_data[0].get('values', [])
	30	first_values = row_data[1].get('values', [])
	31	# LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
	32
	33	sheet_json_schema['type'] = 'object'
	34	sheet_json_schema['additionalProperties'] = False
	35	sheet_json_schema = {
	36	'type': 'object',
	37	'additionalProperties': False,
	38	'properties': {
	39	'__sdc_spreadsheet_id': {
	40	'type': ['null', 'string']
	41	},
	42	'__sdc_sheet_id': {
	43	'type': ['null', 'integer']
	44	},
	45	'__sdc_row': {
	46	'type': ['null', 'integer']
	47	}
	48	}
	49	}
	50
	51	header_list = [] # used for checking uniqueness
	52	columns = []
	53	prior_header = None
	54	i = 0
	55	skipped = 0
	56	# Read column headers until end or 2 consecutive skipped headers
	57	for header in headers:
	58	# LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
	59	column_index = i + 1
	60	column_letter = colnum_string(column_index)
	61	header_value = header.get('formattedValue')
	62	if header_value: # NOT skipped
	63	column_is_skipped = False
	64	skipped = 0
	65	column_name = '{}'.format(header_value)
	66	if column_name in header_list:
	67	raise Exception('DUPLICATE HEADER ERROR: {}'.format(column_name))
99424fee	68	header_list.append(column_name)
89643ba6 JH	69
89643ba6 JH	70	first_value = first_values[i]
89643ba6 JH	71
	72	column_effective_value = first_value.get('effectiveValue', {})
	73	for key in column_effective_value.keys():
	74	if key in ('numberValue', 'stringValue', 'boolValue', 'errorType', 'formulaType'):
	75	column_effective_value_type = key
	76
99424fee JH	77	column_number_format = first_values[i].get('effectiveFormat', {}).get(
99424fee JH	78	'numberFormat', {})
89643ba6 JH	79	column_number_format_type = column_number_format.get('type')
	80
	81	# Determine datatype for sheet_json_schema
	82	#
99424fee JH	83	# column_effective_value_type = numberValue, stringValue, boolValue;
	84	# INVALID: errorType, formulaType
	85	# https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
89643ba6	86	#
99424fee JH	87	# column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
	88	# TIME, DATE_TIME, SCIENTIFIC
	89	# https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
89643ba6 JH	90	#
	91	column_format = None # Default
	92	# column_multiple_of = None # Default
99424fee	93	if column_effective_value_type == 'stringValue':
89643ba6 JH	94	column_type = ['null', 'string']
	95	column_gs_type = 'stringValue'
	96	elif column_effective_value_type == 'boolValue':
	97	column_type = ['null', 'boolean', 'string']
	98	column_gs_type = 'boolValue'
	99	elif column_effective_value_type == 'numberValue':
	100	if column_number_format_type == 'DATE_TIME':
	101	column_type = ['null', 'string']
	102	column_format = 'date-time'
	103	column_gs_type = 'numberType.DATE_TIME'
	104	elif column_number_format_type == 'DATE':
	105	column_type = ['null', 'string']
	106	column_format = 'date'
	107	column_gs_type = 'numberType.DATE'
	108	elif column_number_format_type == 'TIME':
	109	column_type = ['null', 'string']
	110	column_format = 'time'
	111	column_gs_type = 'numberType.TIME'
	112	elif column_number_format_type == 'TEXT':
	113	column_type = ['null', 'string']
	114	column_gs_type = 'stringValue'
	115	else:
	116	column_type = ['null', 'number', 'string']
	117	column_gs_type = 'numberType'
99424fee JH	118	elif column_effective_value_type in ('formulaValue', 'errorValue'):
	119	raise Exception('INVALID DATA TYPE ERROR: {}, value: {}'.format(column_name, \
	120	column_effective_value_type))
89643ba6 JH	121	else: # skipped
	122	column_is_skipped = True
	123	skipped = skipped + 1
	124	column_index_str = str(column_index).zfill(2)
	125	column_name = '__sdc_skip_col_{}'.format(column_index_str)
	126	column_type = ['null', 'string']
	127	column_format = None
	128	column_gs_type = 'stringValue'
	129
	130	if skipped >= 2:
	131	# skipped = 2 consecutive skipped headers
	132	# Remove prior_header column_name
	133	sheet_json_schema['properties'].pop(prior_header, None)
89643ba6 JH	134	break
	135
	136	else:
	137	column = {}
	138	column = {
	139	'columnIndex': column_index,
	140	'columnLetter': column_letter,
	141	'columnName': column_name,
	142	'columnType': column_gs_type,
	143	'columnSkipped': column_is_skipped
	144	}
	145	columns.append(column)
	146
	147	sheet_json_schema['properties'][column_name] = column
	148	sheet_json_schema['properties'][column_name]['type'] = column_type
	149	if column_format:
	150	sheet_json_schema['properties'][column_name]['format'] = column_format
	151
	152	prior_header = column_name
	153	i = i + 1
	154
	155	return sheet_json_schema, columns
	156
	157
	158	def get_sheet_metadata(sheet, spreadsheet_id, client):
	159	sheet_id = sheet.get('properties', {}).get('sheetId')
	160	sheet_title = sheet.get('properties', {}).get('title')
	161	LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))
	162
	163	stream_name = 'sheet_metadata'
	164	stream_metadata = STREAMS.get(stream_name)
	165	api = stream_metadata.get('api', 'sheets')
	166	params = stream_metadata.get('params', {})
99424fee JH	167	querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \
	168	params.items()]).replace('{sheet_title}', sheet_title)
	169	path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
	170	spreadsheet_id), querystring)
89643ba6 JH	171
	172	sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
	173	sheet_cols = sheet_md_results.get('sheets')[0]
99424fee	174	sheet_schema, columns = get_sheet_schema_columns(sheet_cols)
89643ba6 JH	175
	176	return sheet_schema, columns
	177
	178
	179	def get_abs_path(path):
	180	return os.path.join(os.path.dirname(os.path.realpath(__file__)), path)
	181
	182	def get_schemas(client, spreadsheet_id):
	183	schemas = {}
	184	field_metadata = {}
	185
	186	for stream_name, stream_metadata in STREAMS.items():
	187	schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
	188	with open(schema_path) as file:
	189	schema = json.load(file)
	190	schemas[stream_name] = schema
	191	mdata = metadata.new()
	192
	193	# Documentation:
	194	# https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
	195	# Reference:
	196	# https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
	197	mdata = metadata.get_standard_metadata(
	198	schema=schema,
	199	key_properties=stream_metadata.get('key_properties', None),
	200	valid_replication_keys=stream_metadata.get('replication_keys', None),
	201	replication_method=stream_metadata.get('replication_method', None)
	202	)
	203	field_metadata[stream_name] = mdata
99424fee	204
89643ba6 JH	205	if stream_name == 'spreadsheet_metadata':
	206	api = stream_metadata.get('api', 'sheets')
	207	params = stream_metadata.get('params', {})
	208	querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
99424fee JH	209	path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \
99424fee JH	210	spreadsheet_id), querystring)
89643ba6	211
99424fee JH	212	spreadsheet_md_results = client.get(path=path, params=querystring, api=api, \
99424fee JH	213	endpoint=stream_name)
89643ba6 JH	214
	215	sheets = spreadsheet_md_results.get('sheets')
	216	if sheets:
	217	for sheet in sheets:
	218	sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
99424fee	219	LOGGER.info('columns = {}'.format(columns))
89643ba6 JH	220
	221	sheet_title = sheet.get('properties', {}).get('title')
	222	schemas[sheet_title] = sheet_schema
	223	sheet_mdata = metadata.new()
	224	sheet_mdata = metadata.get_standard_metadata(
	225	schema=sheet_schema,
	226	key_properties=['__sdc_row'],
	227	valid_replication_keys=None,
	228	replication_method='FULL_TABLE'
	229	)
	230	field_metadata[sheet_title] = sheet_mdata
99424fee	231
89643ba6	232	return schemas, field_metadata