v.0.0.4 Logic to skip empty sheets (#4)v0.0.4

* v.0.0.2 schema and sync changes Change number json schema to anyOf with multipleOf; skip empty rows; move write_bookmark to end of sync.py * v.0.0.3 Sync activate version and error handling Update README.md documentation. Improved logging and handling of errors and warnings. Better null handling in Discovery and Sync. Fix issues with activate version messages. * v.0.0.4 Skip empty worksheets Add logic to skip empty worksheets in Discovery and Sync mode. * schema.py fix number datatype issue Nomber datatypes are being created as strings in targets. The JSON schema order needs to be adjusted so that order is null, number, string.
author: Jeff Huth <39202799+jeffhuth-bytecode@users.noreply.github.com> 2020-02-24 09:53:26 -0800
committer: GitHub <noreply@github.com> 2020-02-24 12:53:26 -0500
commit: 376f1145837541d4fff2ad0e499236761f9873c3 (patch)
tree: cc086f18b24bda8a86c16c3ec742b89947f382ae /tap_google_sheets/schema.py
parent: f1d1d43c6b74a8705e91e908c582e39c68464c0c (diff)
download: tap-google-sheets-376f1145837541d4fff2ad0e499236761f9873c3.tar.gz
tap-google-sheets-376f1145837541d4fff2ad0e499236761f9873c3.tar.zst
tap-google-sheets-376f1145837541d4fff2ad0e499236761f9873c3.zip
1 files changed, 185 insertions, 180 deletions
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py
index e319c03..c229d72 100644
--- a/tap_google_sheets/schema.py
+++ b/tap_google_sheets/schema.py
@@ -25,184 +25,188 @@ def get_sheet_schema_columns(sheet):
    sheet_json_schema = OrderedDict()
    data = next(iter(sheet.get('data', [])), {})
    row_data = data.get('rowData', [])
-    # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
+    if row_data == []:
+        # Empty sheet, SKIP
-    headers = row_data[0].get('values', [])
+        LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
-    first_values = row_data[1].get('values', [])
+        return None, None
-    # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
+    else:
+        # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
-    sheet_json_schema = {
+        headers = row_data[0].get('values', [])
-        'type': 'object',
+        first_values = row_data[1].get('values', [])
-        'additionalProperties': False,
+        # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
-        'properties': {
-            '__sdc_spreadsheet_id': {
+        sheet_json_schema = {
-                'type': ['null', 'string']
+            'type': 'object',
-            },
+            'additionalProperties': False,
-            '__sdc_sheet_id': {
+            'properties': {
-                'type': ['null', 'integer']
+                '__sdc_spreadsheet_id': {
-            },
+                    'type': ['null', 'string']
-            '__sdc_row': {
+                },
-                'type': ['null', 'integer']
+                '__sdc_sheet_id': {
+                    'type': ['null', 'integer']
+                },
+                '__sdc_row': {
+                    'type': ['null', 'integer']
+                }
            }
        }
-    }
+        header_list = [] # used for checking uniqueness
-    header_list = [] # used for checking uniqueness
+        columns = []
-    columns = []
+        prior_header = None
-    prior_header = None
+        i = 0
-    i = 0
+        skipped = 0
-    skipped = 0
+        # Read column headers until end or 2 consecutive skipped headers
-    # Read column headers until end or 2 consecutive skipped headers
+        for header in headers:
-    for header in headers:
+            # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
-        # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
+            column_index = i + 1
-        column_index = i + 1
+            column_letter = colnum_string(column_index)
-        column_letter = colnum_string(column_index)
+            header_value = header.get('formattedValue')
-        header_value = header.get('formattedValue')
+            if header_value: # NOT skipped
-        if header_value: # NOT skipped
+                column_is_skipped = False
-            column_is_skipped = False
+                skipped = 0
-            skipped = 0
+                column_name = '{}'.format(header_value)
-            column_name = '{}'.format(header_value)
+                if column_name in header_list:
-            if column_name in header_list:
+                    raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
-                raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
-                    sheet_title, column_name, column_letter))
-            header_list.append(column_name)
-            first_value = None
-            try:
-                first_value = first_values[i]
-            except IndexError as err:
-                raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
-                    sheet_title, column_name, column_letter, err))
-            
-            column_effective_value = first_value.get('effectiveValue', {})
-            col_val = None
-            if column_effective_value == {}:
-                column_effective_value_type = 'stringValue'
-                LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
-                    sheet_title, column_name, column_letter))
-                LOGGER.info('   Setting column datatype to STRING')
-            else:
-                for key, val in column_effective_value.items():
-                    if key in ('numberValue', 'stringValue', 'boolValue'):
-                        column_effective_value_type = key
-                        col_val = str(val)
-                    elif key in ('errorType', 'formulaType'):
-                        col_val = str(val)
-                        raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
-                            sheet_title, column_name, column_letter, key, col_val))
-            column_number_format = first_values[i].get('effectiveFormat', {}).get(
-                'numberFormat', {})
-            column_number_format_type = column_number_format.get('type')
-            # Determine datatype for sheet_json_schema
-            #
-            # column_effective_value_type = numberValue, stringValue, boolValue;
-            #  INVALID: errorType, formulaType
-            #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
-            #
-            # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
-            #   TIME, DATE_TIME, SCIENTIFIC
-            #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
-            #
-            column_format = None # Default
-            if column_effective_value == {}:
-                col_properties = {'type': ['null', 'string']}
-                column_gs_type = 'stringValue'
-                LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
                        sheet_title, column_name, column_letter))
-                LOGGER.info('   Setting column datatype to STRING')
+                header_list.append(column_name)
-            elif column_effective_value_type == 'stringValue':
-                col_properties = {'type': ['null', 'string']}
+                first_value = None
-                column_gs_type = 'stringValue'
+                try:
-            elif column_effective_value_type == 'boolValue':
+                    first_value = first_values[i]
-                col_properties = {'type': ['null', 'boolean', 'string']}
+                except IndexError as err:
-                column_gs_type = 'boolValue'
+                    raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
-            elif column_effective_value_type == 'numberValue':
+                        sheet_title, column_name, column_letter, err))
-                if column_number_format_type == 'DATE_TIME':
+                
-                    col_properties = {
+                column_effective_value = first_value.get('effectiveValue', {})
-                        'type': ['null', 'string'],
-                        'format': 'date-time'
+                col_val = None
-                    }
+                if column_effective_value == {}:
-                    column_gs_type = 'numberType.DATE_TIME'
+                    column_effective_value_type = 'stringValue'
-                elif column_number_format_type == 'DATE':
+                    LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
-                    col_properties = {
+                        sheet_title, column_name, column_letter))
-                        'type': ['null', 'string'],
+                    LOGGER.info('   Setting column datatype to STRING')
-                        'format': 'date'
+                else:
-                    }
+                    for key, val in column_effective_value.items():
-                    column_gs_type = 'numberType.DATE'
+                        if key in ('numberValue', 'stringValue', 'boolValue'):
-                elif column_number_format_type == 'TIME':
+                            column_effective_value_type = key
-                    col_properties = {
+                            col_val = str(val)
-                        'type': ['null', 'string'],
+                        elif key in ('errorType', 'formulaType'):
-                        'format': 'time'
+                            col_val = str(val)
-                    }
+                            raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
-                    column_gs_type = 'numberType.TIME'
+                                sheet_title, column_name, column_letter, key, col_val))
-                elif column_number_format_type == 'TEXT':
+                column_number_format = first_values[i].get('effectiveFormat', {}).get(
+                    'numberFormat', {})
+                column_number_format_type = column_number_format.get('type')
+                # Determine datatype for sheet_json_schema
+                #
+                # column_effective_value_type = numberValue, stringValue, boolValue;
+                #  INVALID: errorType, formulaType
+                #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
+                #
+                # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
+                #   TIME, DATE_TIME, SCIENTIFIC
+                #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
+                #
+                column_format = None # Default
+                if column_effective_value == {}:
+                    col_properties = {'type': ['null', 'string']}
+                    column_gs_type = 'stringValue'
+                    LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
+                            sheet_title, column_name, column_letter))
+                    LOGGER.info('   Setting column datatype to STRING')
+                elif column_effective_value_type == 'stringValue':
                    col_properties = {'type': ['null', 'string']}
                    column_gs_type = 'stringValue'
+                elif column_effective_value_type == 'boolValue':
+                    col_properties = {'type': ['null', 'boolean', 'string']}
+                    column_gs_type = 'boolValue'
+                elif column_effective_value_type == 'numberValue':
+                    if column_number_format_type == 'DATE_TIME':
+                        col_properties = {
+                            'type': ['null', 'string'],
+                            'format': 'date-time'
+                        }
+                        column_gs_type = 'numberType.DATE_TIME'
+                    elif column_number_format_type == 'DATE':
+                        col_properties = {
+                            'type': ['null', 'string'],
+                            'format': 'date'
+                        }
+                        column_gs_type = 'numberType.DATE'
+                    elif column_number_format_type == 'TIME':
+                        col_properties = {
+                            'type': ['null', 'string'],
+                            'format': 'time'
+                        }
+                        column_gs_type = 'numberType.TIME'
+                    elif column_number_format_type == 'TEXT':
+                        col_properties = {'type': ['null', 'string']}
+                        column_gs_type = 'stringValue'
+                    else:
+                        # Interesting - order in the anyOf makes a difference.
+                        # Number w/ multipleOf must be listed last, otherwise errors occur.
+                        col_properties =  {
+                            'anyOf': [
+                                {
+                                    'type': 'null'
+                                },
+                                {
+                                    'type': 'number',
+                                    'multipleOf': 1e-15
+                                },
+                                {
+                                    'type': 'string'
+                                }
+                            ]
+                        }
+                        column_gs_type = 'numberType'
+                # Catch-all to deal with other types and set to string
+                # column_effective_value_type: formulaValue, errorValue, or other
                else:
-                    # Interesting - order in the anyOf makes a difference.
+                    col_properties = {'type': ['null', 'string']}
-                    # Number w/ multipleOf must be listed last, otherwise errors occur.
+                    column_gs_type = 'unsupportedValue'
-                    col_properties =  {
+                    LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
-                        'anyOf': [
+                            sheet_title, column_name, column_letter, column_effective_value_type, col_val))
-                            {
+                    LOGGER.info('Converting to string.')
-                                'type': 'string'
+            else: # skipped
-                            },
+                column_is_skipped = True
-                            {
+                skipped = skipped + 1
-                                'type': 'null'
+                column_index_str = str(column_index).zfill(2)
-                            },
+                column_name = '__sdc_skip_col_{}'.format(column_index_str)
-                            {
-                                'type': 'number',
-                                'multipleOf': 1e-15
-                            }
-                        ]
-                    }
-                    column_gs_type = 'numberType'
-            # Catch-all to deal with other types and set to string
-            # column_effective_value_type: formulaValue, errorValue, or other
-            else:
                col_properties = {'type': ['null', 'string']}
-                column_gs_type = 'unsupportedValue'
+                column_gs_type = 'stringValue'
-                LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
+                LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
-                        sheet_title, column_name, column_letter, column_effective_value_type, col_val))
+                    sheet_title, column_name, column_letter))
-                LOGGER.info('Converting to string.')
+                LOGGER.info('  This column will be skipped during data loading.')
-        else: # skipped
-            column_is_skipped = True
-            skipped = skipped + 1
-            column_index_str = str(column_index).zfill(2)
-            column_name = '__sdc_skip_col_{}'.format(column_index_str)
-            col_properties = {'type': ['null', 'string']}
-            column_gs_type = 'stringValue'
-            LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
-                sheet_title, column_name, column_letter))
-            LOGGER.info('  This column will be skipped during data loading.')
-        if skipped >= 2:
-            # skipped = 2 consecutive skipped headers
-            # Remove prior_header column_name
-            sheet_json_schema['properties'].pop(prior_header, None)
-            LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
-                sheet_title, column_name, column_letter))
-            break
-        else:
-            column = {}
-            column = {
-                'columnIndex': column_index,
-                'columnLetter': column_letter,
-                'columnName': column_name,
-                'columnType': column_gs_type,
-                'columnSkipped': column_is_skipped
-            }
-            columns.append(column)
-            sheet_json_schema['properties'][column_name] = col_properties
+            if skipped >= 2:
+                # skipped = 2 consecutive skipped headers
+                # Remove prior_header column_name
+                sheet_json_schema['properties'].pop(prior_header, None)
+                LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
+                    sheet_title, column_name, column_letter))
+                break
-        prior_header = column_name
+            else:
-        i = i + 1
+                column = {}
+                column = {
+                    'columnIndex': column_index,
+                    'columnLetter': column_letter,
+                    'columnName': column_name,
+                    'columnType': column_gs_type,
+                    'columnSkipped': column_is_skipped
+                }
+                columns.append(column)
-    return sheet_json_schema, columns
+                sheet_json_schema['properties'][column_name] = col_properties
+            prior_header = column_name
+            i = i + 1
+        return sheet_json_schema, columns
 # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
@@ -276,17 +280,18 @@ def get_schemas(client, spreadsheet_id):
                for sheet in sheets:
                    # GET sheet_json_schema for each worksheet (from function above)
                    sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
-                    # LOGGER.info('columns = {}'.format(columns))
+                    # SKIP empty sheets (where sheet_json_schema and columns are None)
-                    sheet_title = sheet.get('properties', {}).get('title')
+                    if sheet_json_schema and columns:
-                    schemas[sheet_title] = sheet_json_schema
+                        sheet_title = sheet.get('properties', {}).get('title')
-                    sheet_mdata = metadata.new()
+                        schemas[sheet_title] = sheet_json_schema
-                    sheet_mdata = metadata.get_standard_metadata(
+                        sheet_mdata = metadata.new()
-                        schema=sheet_json_schema,
+                        sheet_mdata = metadata.get_standard_metadata(
-                        key_properties=['__sdc_row'],
+                            schema=sheet_json_schema,
-                        valid_replication_keys=None,
+                            key_properties=['__sdc_row'],
-                        replication_method='FULL_TABLE'
+                            valid_replication_keys=None,
-                    )
+                            replication_method='FULL_TABLE'
-                    field_metadata[sheet_title] = sheet_mdata
+                        )
+                        field_metadata[sheet_title] = sheet_mdata
    return schemas, field_metadata
author	Jeff Huth <39202799+jeffhuth-bytecode@users.noreply.github.com>	2020-02-24 09:53:26 -0800
committer	GitHub <noreply@github.com>	2020-02-24 12:53:26 -0500
commit	376f1145837541d4fff2ad0e499236761f9873c3 (patch)
tree	cc086f18b24bda8a86c16c3ec742b89947f382ae /tap_google_sheets/schema.py
parent	f1d1d43c6b74a8705e91e908c582e39c68464c0c (diff)
download	tap-google-sheets-376f1145837541d4fff2ad0e499236761f9873c3.tar.gz tap-google-sheets-376f1145837541d4fff2ad0e499236761f9873c3.tar.zst tap-google-sheets-376f1145837541d4fff2ad0e499236761f9873c3.zip

diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index e319c03..c229d72 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py
@@ -25,184 +25,188 @@ def get_sheet_schema_columns(sheet):
25	sheet_json_schema = OrderedDict()	25	sheet_json_schema = OrderedDict()
26	data = next(iter(sheet.get('data', [])), {})	26	data = next(iter(sheet.get('data', [])), {})
27	row_data = data.get('rowData', [])	27	row_data = data.get('rowData', [])
28	# spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse	28	if row_data == []:
29		29	# Empty sheet, SKIP
30	headers = row_data[0].get('values', [])	30	LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
31	first_values = row_data[1].get('values', [])	31	return None, None
32	# LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))	32	else:
33		33	# spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
34	sheet_json_schema = {	34	headers = row_data[0].get('values', [])
35	'type': 'object',	35	first_values = row_data[1].get('values', [])
36	'additionalProperties': False,	36	# LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
37	'properties': {	37
38	'__sdc_spreadsheet_id': {	38	sheet_json_schema = {
39	'type': ['null', 'string']	39	'type': 'object',
40	},	40	'additionalProperties': False,
41	'__sdc_sheet_id': {	41	'properties': {
42	'type': ['null', 'integer']	42	'__sdc_spreadsheet_id': {
43	},	43	'type': ['null', 'string']
44	'__sdc_row': {	44	},
45	'type': ['null', 'integer']	45	'__sdc_sheet_id': {
		46	'type': ['null', 'integer']
		47	},
		48	'__sdc_row': {
		49	'type': ['null', 'integer']
		50	}
46	}	51	}
47	}	52	}
48	}	53
49		54	header_list = [] # used for checking uniqueness
50	header_list = [] # used for checking uniqueness	55	columns = []
51	columns = []	56	prior_header = None
52	prior_header = None	57	i = 0
53	i = 0	58	skipped = 0
54	skipped = 0	59	# Read column headers until end or 2 consecutive skipped headers
55	# Read column headers until end or 2 consecutive skipped headers	60	for header in headers:
56	for header in headers:	61	# LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
57	# LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))	62	column_index = i + 1
58	column_index = i + 1	63	column_letter = colnum_string(column_index)
59	column_letter = colnum_string(column_index)	64	header_value = header.get('formattedValue')
60	header_value = header.get('formattedValue')	65	if header_value: # NOT skipped
61	if header_value: # NOT skipped	66	column_is_skipped = False
62	column_is_skipped = False	67	skipped = 0
63	skipped = 0	68	column_name = '{}'.format(header_value)
64	column_name = '{}'.format(header_value)	69	if column_name in header_list:
65	if column_name in header_list:	70	raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
66	raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
67	sheet_title, column_name, column_letter))
68	header_list.append(column_name)
69
70	first_value = None
71	try:
72	first_value = first_values[i]
73	except IndexError as err:
74	raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
75	sheet_title, column_name, column_letter, err))
76
77	column_effective_value = first_value.get('effectiveValue', {})
78
79	col_val = None
80	if column_effective_value == {}:
81	column_effective_value_type = 'stringValue'
82	LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
83	sheet_title, column_name, column_letter))
84	LOGGER.info(' Setting column datatype to STRING')
85	else:
86	for key, val in column_effective_value.items():
87	if key in ('numberValue', 'stringValue', 'boolValue'):
88	column_effective_value_type = key
89	col_val = str(val)
90	elif key in ('errorType', 'formulaType'):
91	col_val = str(val)
92	raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
93	sheet_title, column_name, column_letter, key, col_val))
94
95	column_number_format = first_values[i].get('effectiveFormat', {}).get(
96	'numberFormat', {})
97	column_number_format_type = column_number_format.get('type')
98
99	# Determine datatype for sheet_json_schema
100	#
101	# column_effective_value_type = numberValue, stringValue, boolValue;
102	# INVALID: errorType, formulaType
103	# https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
104	#
105	# column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
106	# TIME, DATE_TIME, SCIENTIFIC
107	# https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
108	#
109	column_format = None # Default
110	if column_effective_value == {}:
111	col_properties = {'type': ['null', 'string']}
112	column_gs_type = 'stringValue'
113	LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
114	sheet_title, column_name, column_letter))	71	sheet_title, column_name, column_letter))
115	LOGGER.info(' Setting column datatype to STRING')	72	header_list.append(column_name)
116	elif column_effective_value_type == 'stringValue':	73
117	col_properties = {'type': ['null', 'string']}	74	first_value = None
118	column_gs_type = 'stringValue'	75	try:
119	elif column_effective_value_type == 'boolValue':	76	first_value = first_values[i]
120	col_properties = {'type': ['null', 'boolean', 'string']}	77	except IndexError as err:
121	column_gs_type = 'boolValue'	78	raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
122	elif column_effective_value_type == 'numberValue':	79	sheet_title, column_name, column_letter, err))
123	if column_number_format_type == 'DATE_TIME':	80
124	col_properties = {	81	column_effective_value = first_value.get('effectiveValue', {})
125	'type': ['null', 'string'],	82
126	'format': 'date-time'	83	col_val = None
127	}	84	if column_effective_value == {}:
128	column_gs_type = 'numberType.DATE_TIME'	85	column_effective_value_type = 'stringValue'
129	elif column_number_format_type == 'DATE':	86	LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
130	col_properties = {	87	sheet_title, column_name, column_letter))
131	'type': ['null', 'string'],	88	LOGGER.info(' Setting column datatype to STRING')
132	'format': 'date'	89	else:
133	}	90	for key, val in column_effective_value.items():
134	column_gs_type = 'numberType.DATE'	91	if key in ('numberValue', 'stringValue', 'boolValue'):
135	elif column_number_format_type == 'TIME':	92	column_effective_value_type = key
136	col_properties = {	93	col_val = str(val)
137	'type': ['null', 'string'],	94	elif key in ('errorType', 'formulaType'):
138	'format': 'time'	95	col_val = str(val)
139	}	96	raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
140	column_gs_type = 'numberType.TIME'	97	sheet_title, column_name, column_letter, key, col_val))
141	elif column_number_format_type == 'TEXT':	98
		99	column_number_format = first_values[i].get('effectiveFormat', {}).get(
		100	'numberFormat', {})
		101	column_number_format_type = column_number_format.get('type')
		102
		103	# Determine datatype for sheet_json_schema
		104	#
		105	# column_effective_value_type = numberValue, stringValue, boolValue;
		106	# INVALID: errorType, formulaType
		107	# https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
		108	#
		109	# column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
		110	# TIME, DATE_TIME, SCIENTIFIC
		111	# https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
		112	#
		113	column_format = None # Default
		114	if column_effective_value == {}:
		115	col_properties = {'type': ['null', 'string']}
		116	column_gs_type = 'stringValue'
		117	LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
		118	sheet_title, column_name, column_letter))
		119	LOGGER.info(' Setting column datatype to STRING')
		120	elif column_effective_value_type == 'stringValue':
142	col_properties = {'type': ['null', 'string']}	121	col_properties = {'type': ['null', 'string']}
143	column_gs_type = 'stringValue'	122	column_gs_type = 'stringValue'
		123	elif column_effective_value_type == 'boolValue':
		124	col_properties = {'type': ['null', 'boolean', 'string']}
		125	column_gs_type = 'boolValue'
		126	elif column_effective_value_type == 'numberValue':
		127	if column_number_format_type == 'DATE_TIME':
		128	col_properties = {
		129	'type': ['null', 'string'],
		130	'format': 'date-time'
		131	}
		132	column_gs_type = 'numberType.DATE_TIME'
		133	elif column_number_format_type == 'DATE':
		134	col_properties = {
		135	'type': ['null', 'string'],
		136	'format': 'date'
		137	}
		138	column_gs_type = 'numberType.DATE'
		139	elif column_number_format_type == 'TIME':
		140	col_properties = {
		141	'type': ['null', 'string'],
		142	'format': 'time'
		143	}
		144	column_gs_type = 'numberType.TIME'
		145	elif column_number_format_type == 'TEXT':
		146	col_properties = {'type': ['null', 'string']}
		147	column_gs_type = 'stringValue'
		148	else:
		149	# Interesting - order in the anyOf makes a difference.
		150	# Number w/ multipleOf must be listed last, otherwise errors occur.
		151	col_properties = {
		152	'anyOf': [
		153	{
		154	'type': 'null'
		155	},
		156	{
		157	'type': 'number',
		158	'multipleOf': 1e-15
		159	},
		160	{
		161	'type': 'string'
		162	}
		163	]
		164	}
		165	column_gs_type = 'numberType'
		166	# Catch-all to deal with other types and set to string
		167	# column_effective_value_type: formulaValue, errorValue, or other
144	else:	168	else:
145	# Interesting - order in the anyOf makes a difference.	169	col_properties = {'type': ['null', 'string']}
146	# Number w/ multipleOf must be listed last, otherwise errors occur.	170	column_gs_type = 'unsupportedValue'
147	col_properties = {	171	LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
148	'anyOf': [	172	sheet_title, column_name, column_letter, column_effective_value_type, col_val))
149	{	173	LOGGER.info('Converting to string.')
150	'type': 'string'	174	else: # skipped
151	},	175	column_is_skipped = True
152	{	176	skipped = skipped + 1
153	'type': 'null'	177	column_index_str = str(column_index).zfill(2)
154	},	178	column_name = '__sdc_skip_col_{}'.format(column_index_str)
155	{
156	'type': 'number',
157	'multipleOf': 1e-15
158	}
159	]
160	}
161	column_gs_type = 'numberType'
162	# Catch-all to deal with other types and set to string
163	# column_effective_value_type: formulaValue, errorValue, or other
164	else:
165	col_properties = {'type': ['null', 'string']}	179	col_properties = {'type': ['null', 'string']}
166	column_gs_type = 'unsupportedValue'	180	column_gs_type = 'stringValue'
167	LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(	181	LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
168	sheet_title, column_name, column_letter, column_effective_value_type, col_val))	182	sheet_title, column_name, column_letter))
169	LOGGER.info('Converting to string.')	183	LOGGER.info(' This column will be skipped during data loading.')
170	else: # skipped
171	column_is_skipped = True
172	skipped = skipped + 1
173	column_index_str = str(column_index).zfill(2)
174	column_name = '__sdc_skip_col_{}'.format(column_index_str)
175	col_properties = {'type': ['null', 'string']}
176	column_gs_type = 'stringValue'
177	LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
178	sheet_title, column_name, column_letter))
179	LOGGER.info(' This column will be skipped during data loading.')
180
181	if skipped >= 2:
182	# skipped = 2 consecutive skipped headers
183	# Remove prior_header column_name
184	sheet_json_schema['properties'].pop(prior_header, None)
185	LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
186	sheet_title, column_name, column_letter))
187	break
188
189	else:
190	column = {}
191	column = {
192	'columnIndex': column_index,
193	'columnLetter': column_letter,
194	'columnName': column_name,
195	'columnType': column_gs_type,
196	'columnSkipped': column_is_skipped
197	}
198	columns.append(column)
199		184
200	sheet_json_schema['properties'][column_name] = col_properties	185	if skipped >= 2:
		186	# skipped = 2 consecutive skipped headers
		187	# Remove prior_header column_name
		188	sheet_json_schema['properties'].pop(prior_header, None)
		189	LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
		190	sheet_title, column_name, column_letter))
		191	break
201		192
202	prior_header = column_name	193	else:
203	i = i + 1	194	column = {}
		195	column = {
		196	'columnIndex': column_index,
		197	'columnLetter': column_letter,
		198	'columnName': column_name,
		199	'columnType': column_gs_type,
		200	'columnSkipped': column_is_skipped
		201	}
		202	columns.append(column)
204		203
205	return sheet_json_schema, columns	204	sheet_json_schema['properties'][column_name] = col_properties
		205
		206	prior_header = column_name
		207	i = i + 1
		208
		209	return sheet_json_schema, columns
206		210
207		211
208	# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query	212	# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
@@ -276,17 +280,18 @@ def get_schemas(client, spreadsheet_id):
276	for sheet in sheets:	280	for sheet in sheets:
277	# GET sheet_json_schema for each worksheet (from function above)	281	# GET sheet_json_schema for each worksheet (from function above)
278	sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)	282	sheet_json_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
279	# LOGGER.info('columns = {}'.format(columns))	283
280		284	# SKIP empty sheets (where sheet_json_schema and columns are None)
281	sheet_title = sheet.get('properties', {}).get('title')	285	if sheet_json_schema and columns:
282	schemas[sheet_title] = sheet_json_schema	286	sheet_title = sheet.get('properties', {}).get('title')
283	sheet_mdata = metadata.new()	287	schemas[sheet_title] = sheet_json_schema
284	sheet_mdata = metadata.get_standard_metadata(	288	sheet_mdata = metadata.new()
285	schema=sheet_json_schema,	289	sheet_mdata = metadata.get_standard_metadata(
286	key_properties=['__sdc_row'],	290	schema=sheet_json_schema,
287	valid_replication_keys=None,	291	key_properties=['__sdc_row'],
288	replication_method='FULL_TABLE'	292	valid_replication_keys=None,
289	)	293	replication_method='FULL_TABLE'
290	field_metadata[sheet_title] = sheet_mdata	294	)
		295	field_metadata[sheet_title] = sheet_mdata
291		296
292	return schemas, field_metadata	297	return schemas, field_metadata