Skip malformed sheets (#13)

* Skip malformed sheets
* Move the try/except block up a level for readability's sake
* Remove debug code
author: cosimon <cosimon@users.noreply.github.com> 2020-04-17 09:09:41 -0400
committer: GitHub <noreply@github.com> 2020-04-17 09:09:41 -0400
commit: f6d7c509f00feb63810df7a9216fe1769195197a (patch)
tree: 0727c5ccc646e49177db4dec43b58df4c02aa2c2
parent: f012a137927591d5d9068d5d499d424ea4112ea9 (diff)
download: tap-google-sheets-f6d7c509f00feb63810df7a9216fe1769195197a.tar.gz
tap-google-sheets-f6d7c509f00feb63810df7a9216fe1769195197a.tar.zst
tap-google-sheets-f6d7c509f00feb63810df7a9216fe1769195197a.zip
2 files changed, 178 insertions, 168 deletions
diff --git a/setup.py b/setup.py
index 27b8573..0582e1e 100644
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,13 @@ setup(name='tap-google-sheets',
          'requests==2.22.0',
          'singer-python==5.9.0'
      ],
+      extras_require={
+          'dev': [
+              'ipdb==0.11',
+              'pylint',
+              'nose'
+          ]
+      },
      entry_points='''
          [console_scripts]
          tap-google-sheets=tap_google_sheets:main
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py
index c229d72..3e63e75 100644
--- a/tap_google_sheets/schema.py
+++ b/tap_google_sheets/schema.py
@@ -29,184 +29,183 @@ def get_sheet_schema_columns(sheet):
        # Empty sheet, SKIP
        LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
        return None, None
-    else:
+    # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
-        # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
+    headers = row_data[0].get('values', [])
-        headers = row_data[0].get('values', [])
+    first_values = row_data[1].get('values', [])
-        first_values = row_data[1].get('values', [])
+    # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
-        # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
+    sheet_json_schema = {
-        sheet_json_schema = {
+        'type': 'object',
-            'type': 'object',
+        'additionalProperties': False,
-            'additionalProperties': False,
+        'properties': {
-            'properties': {
+            '__sdc_spreadsheet_id': {
-                '__sdc_spreadsheet_id': {
+                'type': ['null', 'string']
-                    'type': ['null', 'string']
+            },
-                },
+            '__sdc_sheet_id': {
-                '__sdc_sheet_id': {
+                'type': ['null', 'integer']
-                    'type': ['null', 'integer']
+            },
-                },
+            '__sdc_row': {
-                '__sdc_row': {
+                'type': ['null', 'integer']
-                    'type': ['null', 'integer']
-                }
            }
        }
+    }
+    header_list = [] # used for checking uniqueness
+    columns = []
+    prior_header = None
+    i = 0
+    skipped = 0
+    # Read column headers until end or 2 consecutive skipped headers
+    for header in headers:
+        # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
+        column_index = i + 1
+        column_letter = colnum_string(column_index)
+        header_value = header.get('formattedValue')
+        if header_value: # NOT skipped
+            column_is_skipped = False
+            skipped = 0
+            column_name = '{}'.format(header_value)
+            if column_name in header_list:
+                raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
+                    sheet_title, column_name, column_letter))
+            header_list.append(column_name)
-        header_list = [] # used for checking uniqueness
+            first_value = None
-        columns = []
+            try:
-        prior_header = None
+                first_value = first_values[i]
-        i = 0
+            except IndexError as err:
-        skipped = 0
+                raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
-        # Read column headers until end or 2 consecutive skipped headers
+                    sheet_title, column_name, column_letter, err))
-        for header in headers:
-            # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
+            column_effective_value = first_value.get('effectiveValue', {})
-            column_index = i + 1
-            column_letter = colnum_string(column_index)
+            col_val = None
-            header_value = header.get('formattedValue')
+            if column_effective_value == {}:
-            if header_value: # NOT skipped
+                column_effective_value_type = 'stringValue'
-                column_is_skipped = False
+                LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
-                skipped = 0
+                    sheet_title, column_name, column_letter))
-                column_name = '{}'.format(header_value)
+                LOGGER.info('   Setting column datatype to STRING')
-                if column_name in header_list:
+            else:
-                    raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
+                for key, val in column_effective_value.items():
-                        sheet_title, column_name, column_letter))
+                    if key in ('numberValue', 'stringValue', 'boolValue'):
-                header_list.append(column_name)
+                        column_effective_value_type = key
+                        col_val = str(val)
-                first_value = None
+                    elif key in ('errorType', 'formulaType'):
-                try:
+                        col_val = str(val)
-                    first_value = first_values[i]
+                        raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
-                except IndexError as err:
+                            sheet_title, column_name, column_letter, key, col_val))
-                    raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
-                        sheet_title, column_name, column_letter, err))
+            column_number_format = first_values[i].get('effectiveFormat', {}).get(
-                
+                'numberFormat', {})
-                column_effective_value = first_value.get('effectiveValue', {})
+            column_number_format_type = column_number_format.get('type')
-                col_val = None
+            # Determine datatype for sheet_json_schema
-                if column_effective_value == {}:
+            #
-                    column_effective_value_type = 'stringValue'
+            # column_effective_value_type = numberValue, stringValue, boolValue;
-                    LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
+            #  INVALID: errorType, formulaType
+            #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
+            #
+            # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
+            #   TIME, DATE_TIME, SCIENTIFIC
+            #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
+            #
+            column_format = None # Default
+            if column_effective_value == {}:
+                col_properties = {'type': ['null', 'string']}
+                column_gs_type = 'stringValue'
+                LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
                        sheet_title, column_name, column_letter))
-                    LOGGER.info('   Setting column datatype to STRING')
+                LOGGER.info('   Setting column datatype to STRING')
-                else:
+            elif column_effective_value_type == 'stringValue':
-                    for key, val in column_effective_value.items():
+                col_properties = {'type': ['null', 'string']}
-                        if key in ('numberValue', 'stringValue', 'boolValue'):
+                column_gs_type = 'stringValue'
-                            column_effective_value_type = key
+            elif column_effective_value_type == 'boolValue':
-                            col_val = str(val)
+                col_properties = {'type': ['null', 'boolean', 'string']}
-                        elif key in ('errorType', 'formulaType'):
+                column_gs_type = 'boolValue'
-                            col_val = str(val)
+            elif column_effective_value_type == 'numberValue':
-                            raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
+                if column_number_format_type == 'DATE_TIME':
-                                sheet_title, column_name, column_letter, key, col_val))
+                    col_properties = {
+                        'type': ['null', 'string'],
-                column_number_format = first_values[i].get('effectiveFormat', {}).get(
+                        'format': 'date-time'
-                    'numberFormat', {})
+                    }
-                column_number_format_type = column_number_format.get('type')
+                    column_gs_type = 'numberType.DATE_TIME'
+                elif column_number_format_type == 'DATE':
-                # Determine datatype for sheet_json_schema
+                    col_properties = {
-                #
+                        'type': ['null', 'string'],
-                # column_effective_value_type = numberValue, stringValue, boolValue;
+                        'format': 'date'
-                #  INVALID: errorType, formulaType
+                    }
-                #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
+                    column_gs_type = 'numberType.DATE'
-                #
+                elif column_number_format_type == 'TIME':
-                # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
+                    col_properties = {
-                #   TIME, DATE_TIME, SCIENTIFIC
+                        'type': ['null', 'string'],
-                #  https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
+                        'format': 'time'
-                #
+                    }
-                column_format = None # Default
+                    column_gs_type = 'numberType.TIME'
-                if column_effective_value == {}:
+                elif column_number_format_type == 'TEXT':
-                    col_properties = {'type': ['null', 'string']}
-                    column_gs_type = 'stringValue'
-                    LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
-                            sheet_title, column_name, column_letter))
-                    LOGGER.info('   Setting column datatype to STRING')
-                elif column_effective_value_type == 'stringValue':
                    col_properties = {'type': ['null', 'string']}
                    column_gs_type = 'stringValue'
-                elif column_effective_value_type == 'boolValue':
-                    col_properties = {'type': ['null', 'boolean', 'string']}
-                    column_gs_type = 'boolValue'
-                elif column_effective_value_type == 'numberValue':
-                    if column_number_format_type == 'DATE_TIME':
-                        col_properties = {
-                            'type': ['null', 'string'],
-                            'format': 'date-time'
-                        }
-                        column_gs_type = 'numberType.DATE_TIME'
-                    elif column_number_format_type == 'DATE':
-                        col_properties = {
-                            'type': ['null', 'string'],
-                            'format': 'date'
-                        }
-                        column_gs_type = 'numberType.DATE'
-                    elif column_number_format_type == 'TIME':
-                        col_properties = {
-                            'type': ['null', 'string'],
-                            'format': 'time'
-                        }
-                        column_gs_type = 'numberType.TIME'
-                    elif column_number_format_type == 'TEXT':
-                        col_properties = {'type': ['null', 'string']}
-                        column_gs_type = 'stringValue'
-                    else:
-                        # Interesting - order in the anyOf makes a difference.
-                        # Number w/ multipleOf must be listed last, otherwise errors occur.
-                        col_properties =  {
-                            'anyOf': [
-                                {
-                                    'type': 'null'
-                                },
-                                {
-                                    'type': 'number',
-                                    'multipleOf': 1e-15
-                                },
-                                {
-                                    'type': 'string'
-                                }
-                            ]
-                        }
-                        column_gs_type = 'numberType'
-                # Catch-all to deal with other types and set to string
-                # column_effective_value_type: formulaValue, errorValue, or other
                else:
-                    col_properties = {'type': ['null', 'string']}
+                    # Interesting - order in the anyOf makes a difference.
-                    column_gs_type = 'unsupportedValue'
+                    # Number w/ multipleOf must be listed last, otherwise errors occur.
-                    LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
+                    col_properties =  {
-                            sheet_title, column_name, column_letter, column_effective_value_type, col_val))
+                        'anyOf': [
-                    LOGGER.info('Converting to string.')
+                            {
-            else: # skipped
+                                'type': 'null'
-                column_is_skipped = True
+                            },
-                skipped = skipped + 1
+                            {
-                column_index_str = str(column_index).zfill(2)
+                                'type': 'number',
-                column_name = '__sdc_skip_col_{}'.format(column_index_str)
+                                'multipleOf': 1e-15
-                col_properties = {'type': ['null', 'string']}
+                            },
-                column_gs_type = 'stringValue'
+                            {
-                LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
+                                'type': 'string'
-                    sheet_title, column_name, column_letter))
+                            }
-                LOGGER.info('  This column will be skipped during data loading.')
+                        ]
+                    }
-            if skipped >= 2:
+                    column_gs_type = 'numberType'
-                # skipped = 2 consecutive skipped headers
+            # Catch-all to deal with other types and set to string
-                # Remove prior_header column_name
+            # column_effective_value_type: formulaValue, errorValue, or other
-                sheet_json_schema['properties'].pop(prior_header, None)
-                LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
-                    sheet_title, column_name, column_letter))
-                break
            else:
-                column = {}
+                col_properties = {'type': ['null', 'string']}
-                column = {
+                column_gs_type = 'unsupportedValue'
-                    'columnIndex': column_index,
+                LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
-                    'columnLetter': column_letter,
+                        sheet_title, column_name, column_letter, column_effective_value_type, col_val))
-                    'columnName': column_name,
+                LOGGER.info('Converting to string.')
-                    'columnType': column_gs_type,
+        else: # skipped
-                    'columnSkipped': column_is_skipped
+            column_is_skipped = True
-                }
+            skipped = skipped + 1
-                columns.append(column)
+            column_index_str = str(column_index).zfill(2)
+            column_name = '__sdc_skip_col_{}'.format(column_index_str)
+            col_properties = {'type': ['null', 'string']}
+            column_gs_type = 'stringValue'
+            LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
+                sheet_title, column_name, column_letter))
+            LOGGER.info('  This column will be skipped during data loading.')
+        if skipped >= 2:
+            # skipped = 2 consecutive skipped headers
+            # Remove prior_header column_name
+            sheet_json_schema['properties'].pop(prior_header, None)
+            LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
+                sheet_title, column_name, column_letter))
+            break
+        else:
+            column = {}
+            column = {
+                'columnIndex': column_index,
+                'columnLetter': column_letter,
+                'columnName': column_name,
+                'columnType': column_gs_type,
+                'columnSkipped': column_is_skipped
+            }
+            columns.append(column)
-                sheet_json_schema['properties'][column_name] = col_properties
+            sheet_json_schema['properties'][column_name] = col_properties
-            prior_header = column_name
+        prior_header = column_name
-            i = i + 1
+        i = i + 1
-        return sheet_json_schema, columns
+    return sheet_json_schema, columns
 # Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
@@ -232,7 +231,11 @@ def get_sheet_metadata(sheet, spreadsheet_id, client):
    sheet_metadata = sheet_md_results.get('sheets')[0]
    # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
-    sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
+    try:
+        sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
+    except:
+        LOGGER.info('SKIPPING Malformed sheet: {}'.format(sheet_title))
+        sheet_json_schema, columns = None, None
    return sheet_json_schema, columns
author	cosimon <cosimon@users.noreply.github.com>	2020-04-17 09:09:41 -0400
committer	GitHub <noreply@github.com>	2020-04-17 09:09:41 -0400
commit	f6d7c509f00feb63810df7a9216fe1769195197a (patch)
tree	0727c5ccc646e49177db4dec43b58df4c02aa2c2
parent	f012a137927591d5d9068d5d499d424ea4112ea9 (diff)
download	tap-google-sheets-f6d7c509f00feb63810df7a9216fe1769195197a.tar.gz tap-google-sheets-f6d7c509f00feb63810df7a9216fe1769195197a.tar.zst tap-google-sheets-f6d7c509f00feb63810df7a9216fe1769195197a.zip

diff --git a/setup.py b/setup.py index 27b8573..0582e1e 100644 --- a/setup.py +++ b/setup.py
@@ -13,6 +13,13 @@ setup(name='tap-google-sheets',
13	'requests==2.22.0',	13	'requests==2.22.0',
14	'singer-python==5.9.0'	14	'singer-python==5.9.0'
15	],	15	],
		16	extras_require={
		17	'dev': [
		18	'ipdb==0.11',
		19	'pylint',
		20	'nose'
		21	]
		22	},
16	entry_points='''	23	entry_points='''
17	[console_scripts]	24	[console_scripts]
18	tap-google-sheets=tap_google_sheets:main	25	tap-google-sheets=tap_google_sheets:main


diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index c229d72..3e63e75 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py
@@ -29,184 +29,183 @@ def get_sheet_schema_columns(sheet):
29	# Empty sheet, SKIP	29	# Empty sheet, SKIP
30	LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))	30	LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
31	return None, None	31	return None, None
32	else:	32	# spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
33	# spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse	33	headers = row_data[0].get('values', [])
34	headers = row_data[0].get('values', [])	34	first_values = row_data[1].get('values', [])
35	first_values = row_data[1].get('values', [])	35	# LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
36	# LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))	36
37		37	sheet_json_schema = {
38	sheet_json_schema = {	38	'type': 'object',
39	'type': 'object',	39	'additionalProperties': False,
40	'additionalProperties': False,	40	'properties': {
41	'properties': {	41	'__sdc_spreadsheet_id': {
42	'__sdc_spreadsheet_id': {	42	'type': ['null', 'string']
43	'type': ['null', 'string']	43	},
44	},	44	'__sdc_sheet_id': {
45	'__sdc_sheet_id': {	45	'type': ['null', 'integer']
46	'type': ['null', 'integer']	46	},
47	},	47	'__sdc_row': {
48	'__sdc_row': {	48	'type': ['null', 'integer']
49	'type': ['null', 'integer']
50	}
51	}	49	}
52	}	50	}
		51	}
		52
		53	header_list = [] # used for checking uniqueness
		54	columns = []
		55	prior_header = None
		56	i = 0
		57	skipped = 0
		58	# Read column headers until end or 2 consecutive skipped headers
		59	for header in headers:
		60	# LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
		61	column_index = i + 1
		62	column_letter = colnum_string(column_index)
		63	header_value = header.get('formattedValue')
		64	if header_value: # NOT skipped
		65	column_is_skipped = False
		66	skipped = 0
		67	column_name = '{}'.format(header_value)
		68	if column_name in header_list:
		69	raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
		70	sheet_title, column_name, column_letter))
		71	header_list.append(column_name)
53		72
54	header_list = [] # used for checking uniqueness	73	first_value = None
55	columns = []	74	try:
56	prior_header = None	75	first_value = first_values[i]
57	i = 0	76	except IndexError as err:
58	skipped = 0	77	raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
59	# Read column headers until end or 2 consecutive skipped headers	78	sheet_title, column_name, column_letter, err))
60	for header in headers:	79
61	# LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))	80	column_effective_value = first_value.get('effectiveValue', {})
62	column_index = i + 1	81
63	column_letter = colnum_string(column_index)	82	col_val = None
64	header_value = header.get('formattedValue')	83	if column_effective_value == {}:
65	if header_value: # NOT skipped	84	column_effective_value_type = 'stringValue'
66	column_is_skipped = False	85	LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
67	skipped = 0	86	sheet_title, column_name, column_letter))
68	column_name = '{}'.format(header_value)	87	LOGGER.info(' Setting column datatype to STRING')
69	if column_name in header_list:	88	else:
70	raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(	89	for key, val in column_effective_value.items():
71	sheet_title, column_name, column_letter))	90	if key in ('numberValue', 'stringValue', 'boolValue'):
72	header_list.append(column_name)	91	column_effective_value_type = key
73		92	col_val = str(val)
74	first_value = None	93	elif key in ('errorType', 'formulaType'):
75	try:	94	col_val = str(val)
76	first_value = first_values[i]	95	raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
77	except IndexError as err:	96	sheet_title, column_name, column_letter, key, col_val))
78	raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(	97
79	sheet_title, column_name, column_letter, err))	98	column_number_format = first_values[i].get('effectiveFormat', {}).get(
80		99	'numberFormat', {})
81	column_effective_value = first_value.get('effectiveValue', {})	100	column_number_format_type = column_number_format.get('type')
82		101
83	col_val = None	102	# Determine datatype for sheet_json_schema
84	if column_effective_value == {}:	103	#
85	column_effective_value_type = 'stringValue'	104	# column_effective_value_type = numberValue, stringValue, boolValue;
86	LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(	105	# INVALID: errorType, formulaType
		106	# https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
		107	#
		108	# column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
		109	# TIME, DATE_TIME, SCIENTIFIC
		110	# https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
		111	#
		112	column_format = None # Default
		113	if column_effective_value == {}:
		114	col_properties = {'type': ['null', 'string']}
		115	column_gs_type = 'stringValue'
		116	LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
87	sheet_title, column_name, column_letter))	117	sheet_title, column_name, column_letter))
88	LOGGER.info(' Setting column datatype to STRING')	118	LOGGER.info(' Setting column datatype to STRING')
89	else:	119	elif column_effective_value_type == 'stringValue':
90	for key, val in column_effective_value.items():	120	col_properties = {'type': ['null', 'string']}
91	if key in ('numberValue', 'stringValue', 'boolValue'):	121	column_gs_type = 'stringValue'
92	column_effective_value_type = key	122	elif column_effective_value_type == 'boolValue':
93	col_val = str(val)	123	col_properties = {'type': ['null', 'boolean', 'string']}
94	elif key in ('errorType', 'formulaType'):	124	column_gs_type = 'boolValue'
95	col_val = str(val)	125	elif column_effective_value_type == 'numberValue':
96	raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(	126	if column_number_format_type == 'DATE_TIME':
97	sheet_title, column_name, column_letter, key, col_val))	127	col_properties = {
98		128	'type': ['null', 'string'],
99	column_number_format = first_values[i].get('effectiveFormat', {}).get(	129	'format': 'date-time'
100	'numberFormat', {})	130	}
101	column_number_format_type = column_number_format.get('type')	131	column_gs_type = 'numberType.DATE_TIME'
102		132	elif column_number_format_type == 'DATE':
103	# Determine datatype for sheet_json_schema	133	col_properties = {
104	#	134	'type': ['null', 'string'],
105	# column_effective_value_type = numberValue, stringValue, boolValue;	135	'format': 'date'
106	# INVALID: errorType, formulaType	136	}
107	# https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue	137	column_gs_type = 'numberType.DATE'
108	#	138	elif column_number_format_type == 'TIME':
109	# column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,	139	col_properties = {
110	# TIME, DATE_TIME, SCIENTIFIC	140	'type': ['null', 'string'],
111	# https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType	141	'format': 'time'
112	#	142	}
113	column_format = None # Default	143	column_gs_type = 'numberType.TIME'
114	if column_effective_value == {}:	144	elif column_number_format_type == 'TEXT':
115	col_properties = {'type': ['null', 'string']}
116	column_gs_type = 'stringValue'
117	LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
118	sheet_title, column_name, column_letter))
119	LOGGER.info(' Setting column datatype to STRING')
120	elif column_effective_value_type == 'stringValue':
121	col_properties = {'type': ['null', 'string']}	145	col_properties = {'type': ['null', 'string']}
122	column_gs_type = 'stringValue'	146	column_gs_type = 'stringValue'
123	elif column_effective_value_type == 'boolValue':
124	col_properties = {'type': ['null', 'boolean', 'string']}
125	column_gs_type = 'boolValue'
126	elif column_effective_value_type == 'numberValue':
127	if column_number_format_type == 'DATE_TIME':
128	col_properties = {
129	'type': ['null', 'string'],
130	'format': 'date-time'
131	}
132	column_gs_type = 'numberType.DATE_TIME'
133	elif column_number_format_type == 'DATE':
134	col_properties = {
135	'type': ['null', 'string'],
136	'format': 'date'
137	}
138	column_gs_type = 'numberType.DATE'
139	elif column_number_format_type == 'TIME':
140	col_properties = {
141	'type': ['null', 'string'],
142	'format': 'time'
143	}
144	column_gs_type = 'numberType.TIME'
145	elif column_number_format_type == 'TEXT':
146	col_properties = {'type': ['null', 'string']}
147	column_gs_type = 'stringValue'
148	else:
149	# Interesting - order in the anyOf makes a difference.
150	# Number w/ multipleOf must be listed last, otherwise errors occur.
151	col_properties = {
152	'anyOf': [
153	{
154	'type': 'null'
155	},
156	{
157	'type': 'number',
158	'multipleOf': 1e-15
159	},
160	{
161	'type': 'string'
162	}
163	]
164	}
165	column_gs_type = 'numberType'
166	# Catch-all to deal with other types and set to string
167	# column_effective_value_type: formulaValue, errorValue, or other
168	else:	147	else:
169	col_properties = {'type': ['null', 'string']}	148	# Interesting - order in the anyOf makes a difference.
170	column_gs_type = 'unsupportedValue'	149	# Number w/ multipleOf must be listed last, otherwise errors occur.
171	LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(	150	col_properties = {
172	sheet_title, column_name, column_letter, column_effective_value_type, col_val))	151	'anyOf': [
173	LOGGER.info('Converting to string.')	152	{
174	else: # skipped	153	'type': 'null'
175	column_is_skipped = True	154	},
176	skipped = skipped + 1	155	{
177	column_index_str = str(column_index).zfill(2)	156	'type': 'number',
178	column_name = '__sdc_skip_col_{}'.format(column_index_str)	157	'multipleOf': 1e-15
179	col_properties = {'type': ['null', 'string']}	158	},
180	column_gs_type = 'stringValue'	159	{
181	LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(	160	'type': 'string'
182	sheet_title, column_name, column_letter))	161	}
183	LOGGER.info(' This column will be skipped during data loading.')	162	]
184		163	}
185	if skipped >= 2:	164	column_gs_type = 'numberType'
186	# skipped = 2 consecutive skipped headers	165	# Catch-all to deal with other types and set to string
187	# Remove prior_header column_name	166	# column_effective_value_type: formulaValue, errorValue, or other
188	sheet_json_schema['properties'].pop(prior_header, None)
189	LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
190	sheet_title, column_name, column_letter))
191	break
192
193	else:	167	else:
194	column = {}	168	col_properties = {'type': ['null', 'string']}
195	column = {	169	column_gs_type = 'unsupportedValue'
196	'columnIndex': column_index,	170	LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
197	'columnLetter': column_letter,	171	sheet_title, column_name, column_letter, column_effective_value_type, col_val))
198	'columnName': column_name,	172	LOGGER.info('Converting to string.')
199	'columnType': column_gs_type,	173	else: # skipped
200	'columnSkipped': column_is_skipped	174	column_is_skipped = True
201	}	175	skipped = skipped + 1
202	columns.append(column)	176	column_index_str = str(column_index).zfill(2)
		177	column_name = '__sdc_skip_col_{}'.format(column_index_str)
		178	col_properties = {'type': ['null', 'string']}
		179	column_gs_type = 'stringValue'
		180	LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
		181	sheet_title, column_name, column_letter))
		182	LOGGER.info(' This column will be skipped during data loading.')
		183
		184	if skipped >= 2:
		185	# skipped = 2 consecutive skipped headers
		186	# Remove prior_header column_name
		187	sheet_json_schema['properties'].pop(prior_header, None)
		188	LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
		189	sheet_title, column_name, column_letter))
		190	break
		191
		192	else:
		193	column = {}
		194	column = {
		195	'columnIndex': column_index,
		196	'columnLetter': column_letter,
		197	'columnName': column_name,
		198	'columnType': column_gs_type,
		199	'columnSkipped': column_is_skipped
		200	}
		201	columns.append(column)
203		202
204	sheet_json_schema['properties'][column_name] = col_properties	203	sheet_json_schema['properties'][column_name] = col_properties
205		204
206	prior_header = column_name	205	prior_header = column_name
207	i = i + 1	206	i = i + 1
208		207
209	return sheet_json_schema, columns	208	return sheet_json_schema, columns
210		209
211		210
212	# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query	211	# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
@@ -232,7 +231,11 @@ def get_sheet_metadata(sheet, spreadsheet_id, client):
232	sheet_metadata = sheet_md_results.get('sheets')[0]	231	sheet_metadata = sheet_md_results.get('sheets')[0]
233		232
234	# Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)	233	# Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
235	sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)	234	try:
		235	sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
		236	except:
		237	LOGGER.info('SKIPPING Malformed sheet: {}'.format(sheet_title))
		238	sheet_json_schema, columns = None, None
236		239
237	return sheet_json_schema, columns	240	return sheet_json_schema, columns
238		241