# Empty sheet, SKIP
LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
return None, None
- else:
- # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
- headers = row_data[0].get('values', [])
- first_values = row_data[1].get('values', [])
- # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
-
- sheet_json_schema = {
- 'type': 'object',
- 'additionalProperties': False,
- 'properties': {
- '__sdc_spreadsheet_id': {
- 'type': ['null', 'string']
- },
- '__sdc_sheet_id': {
- 'type': ['null', 'integer']
- },
- '__sdc_row': {
- 'type': ['null', 'integer']
- }
+ # spreadsheet is an OrderedDict, with orderd sheets and rows in the repsonse
+ headers = row_data[0].get('values', [])
+ first_values = row_data[1].get('values', [])
+ # LOGGER.info('first_values = {}'.format(json.dumps(first_values, indent=2, sort_keys=True)))
+
+ sheet_json_schema = {
+ 'type': 'object',
+ 'additionalProperties': False,
+ 'properties': {
+ '__sdc_spreadsheet_id': {
+ 'type': ['null', 'string']
+ },
+ '__sdc_sheet_id': {
+ 'type': ['null', 'integer']
+ },
+ '__sdc_row': {
+ 'type': ['null', 'integer']
}
}
+ }
+
+ header_list = [] # used for checking uniqueness
+ columns = []
+ prior_header = None
+ i = 0
+ skipped = 0
+ # Read column headers until end or 2 consecutive skipped headers
+ for header in headers:
+ # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
+ column_index = i + 1
+ column_letter = colnum_string(column_index)
+ header_value = header.get('formattedValue')
+ if header_value: # NOT skipped
+ column_is_skipped = False
+ skipped = 0
+ column_name = '{}'.format(header_value)
+ if column_name in header_list:
+ raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
+ sheet_title, column_name, column_letter))
+ header_list.append(column_name)
- header_list = [] # used for checking uniqueness
- columns = []
- prior_header = None
- i = 0
- skipped = 0
- # Read column headers until end or 2 consecutive skipped headers
- for header in headers:
- # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True)))
- column_index = i + 1
- column_letter = colnum_string(column_index)
- header_value = header.get('formattedValue')
- if header_value: # NOT skipped
- column_is_skipped = False
- skipped = 0
- column_name = '{}'.format(header_value)
- if column_name in header_list:
- raise Exception('DUPLICATE HEADER ERROR: SHEET: {}, COL: {}, CELL: {}1'.format(
- sheet_title, column_name, column_letter))
- header_list.append(column_name)
-
- first_value = None
- try:
- first_value = first_values[i]
- except IndexError as err:
- raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
- sheet_title, column_name, column_letter, err))
-
- column_effective_value = first_value.get('effectiveValue', {})
-
- col_val = None
- if column_effective_value == {}:
- column_effective_value_type = 'stringValue'
- LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
+ first_value = None
+ try:
+ first_value = first_values[i]
+ except IndexError as err:
+ raise Exception('NO VALUE IN 2ND ROW FOR HEADER ERROR. SHEET: {}, COL: {}, CELL: {}2. {}'.format(
+ sheet_title, column_name, column_letter, err))
+
+ column_effective_value = first_value.get('effectiveValue', {})
+
+ col_val = None
+ if column_effective_value == {}:
+ column_effective_value_type = 'stringValue'
+ LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
+ sheet_title, column_name, column_letter))
+ LOGGER.info(' Setting column datatype to STRING')
+ else:
+ for key, val in column_effective_value.items():
+ if key in ('numberValue', 'stringValue', 'boolValue'):
+ column_effective_value_type = key
+ col_val = str(val)
+ elif key in ('errorType', 'formulaType'):
+ col_val = str(val)
+ raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
+ sheet_title, column_name, column_letter, key, col_val))
+
+ column_number_format = first_values[i].get('effectiveFormat', {}).get(
+ 'numberFormat', {})
+ column_number_format_type = column_number_format.get('type')
+
+ # Determine datatype for sheet_json_schema
+ #
+ # column_effective_value_type = numberValue, stringValue, boolValue;
+ # INVALID: errorType, formulaType
+ # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
+ #
+ # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
+ # TIME, DATE_TIME, SCIENTIFIC
+ # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
+ #
+ column_format = None # Default
+ if column_effective_value == {}:
+ col_properties = {'type': ['null', 'string']}
+ column_gs_type = 'stringValue'
+ LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
sheet_title, column_name, column_letter))
- LOGGER.info(' Setting column datatype to STRING')
- else:
- for key, val in column_effective_value.items():
- if key in ('numberValue', 'stringValue', 'boolValue'):
- column_effective_value_type = key
- col_val = str(val)
- elif key in ('errorType', 'formulaType'):
- col_val = str(val)
- raise Exception('DATA TYPE ERROR 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
- sheet_title, column_name, column_letter, key, col_val))
-
- column_number_format = first_values[i].get('effectiveFormat', {}).get(
- 'numberFormat', {})
- column_number_format_type = column_number_format.get('type')
-
- # Determine datatype for sheet_json_schema
- #
- # column_effective_value_type = numberValue, stringValue, boolValue;
- # INVALID: errorType, formulaType
- # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/other#ExtendedValue
- #
- # column_number_format_type = UNEPECIFIED, TEXT, NUMBER, PERCENT, CURRENCY, DATE,
- # TIME, DATE_TIME, SCIENTIFIC
- # https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
- #
- column_format = None # Default
- if column_effective_value == {}:
- col_properties = {'type': ['null', 'string']}
- column_gs_type = 'stringValue'
- LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
- sheet_title, column_name, column_letter))
- LOGGER.info(' Setting column datatype to STRING')
- elif column_effective_value_type == 'stringValue':
+ LOGGER.info(' Setting column datatype to STRING')
+ elif column_effective_value_type == 'stringValue':
+ col_properties = {'type': ['null', 'string']}
+ column_gs_type = 'stringValue'
+ elif column_effective_value_type == 'boolValue':
+ col_properties = {'type': ['null', 'boolean', 'string']}
+ column_gs_type = 'boolValue'
+ elif column_effective_value_type == 'numberValue':
+ if column_number_format_type == 'DATE_TIME':
+ col_properties = {
+ 'type': ['null', 'string'],
+ 'format': 'date-time'
+ }
+ column_gs_type = 'numberType.DATE_TIME'
+ elif column_number_format_type == 'DATE':
+ col_properties = {
+ 'type': ['null', 'string'],
+ 'format': 'date'
+ }
+ column_gs_type = 'numberType.DATE'
+ elif column_number_format_type == 'TIME':
+ col_properties = {
+ 'type': ['null', 'string'],
+ 'format': 'time'
+ }
+ column_gs_type = 'numberType.TIME'
+ elif column_number_format_type == 'TEXT':
col_properties = {'type': ['null', 'string']}
column_gs_type = 'stringValue'
- elif column_effective_value_type == 'boolValue':
- col_properties = {'type': ['null', 'boolean', 'string']}
- column_gs_type = 'boolValue'
- elif column_effective_value_type == 'numberValue':
- if column_number_format_type == 'DATE_TIME':
- col_properties = {
- 'type': ['null', 'string'],
- 'format': 'date-time'
- }
- column_gs_type = 'numberType.DATE_TIME'
- elif column_number_format_type == 'DATE':
- col_properties = {
- 'type': ['null', 'string'],
- 'format': 'date'
- }
- column_gs_type = 'numberType.DATE'
- elif column_number_format_type == 'TIME':
- col_properties = {
- 'type': ['null', 'string'],
- 'format': 'time'
- }
- column_gs_type = 'numberType.TIME'
- elif column_number_format_type == 'TEXT':
- col_properties = {'type': ['null', 'string']}
- column_gs_type = 'stringValue'
- else:
- # Interesting - order in the anyOf makes a difference.
- # Number w/ multipleOf must be listed last, otherwise errors occur.
- col_properties = {
- 'anyOf': [
- {
- 'type': 'null'
- },
- {
- 'type': 'number',
- 'multipleOf': 1e-15
- },
- {
- 'type': 'string'
- }
- ]
- }
- column_gs_type = 'numberType'
- # Catch-all to deal with other types and set to string
- # column_effective_value_type: formulaValue, errorValue, or other
else:
- col_properties = {'type': ['null', 'string']}
- column_gs_type = 'unsupportedValue'
- LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
- sheet_title, column_name, column_letter, column_effective_value_type, col_val))
- LOGGER.info('Converting to string.')
- else: # skipped
- column_is_skipped = True
- skipped = skipped + 1
- column_index_str = str(column_index).zfill(2)
- column_name = '__sdc_skip_col_{}'.format(column_index_str)
- col_properties = {'type': ['null', 'string']}
- column_gs_type = 'stringValue'
- LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
- sheet_title, column_name, column_letter))
- LOGGER.info(' This column will be skipped during data loading.')
-
- if skipped >= 2:
- # skipped = 2 consecutive skipped headers
- # Remove prior_header column_name
- sheet_json_schema['properties'].pop(prior_header, None)
- LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
- sheet_title, column_name, column_letter))
- break
-
+ # Interesting - order in the anyOf makes a difference.
+ # Number w/ multipleOf must be listed last, otherwise errors occur.
+ col_properties = {
+ 'anyOf': [
+ {
+ 'type': 'null'
+ },
+ {
+ 'type': 'number',
+ 'multipleOf': 1e-15
+ },
+ {
+ 'type': 'string'
+ }
+ ]
+ }
+ column_gs_type = 'numberType'
+ # Catch-all to deal with other types and set to string
+ # column_effective_value_type: formulaValue, errorValue, or other
else:
- column = {}
- column = {
- 'columnIndex': column_index,
- 'columnLetter': column_letter,
- 'columnName': column_name,
- 'columnType': column_gs_type,
- 'columnSkipped': column_is_skipped
- }
- columns.append(column)
+ col_properties = {'type': ['null', 'string']}
+ column_gs_type = 'unsupportedValue'
+ LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}, VALUE: {}'.format(
+ sheet_title, column_name, column_letter, column_effective_value_type, col_val))
+ LOGGER.info('Converting to string.')
+ else: # skipped
+ column_is_skipped = True
+ skipped = skipped + 1
+ column_index_str = str(column_index).zfill(2)
+ column_name = '__sdc_skip_col_{}'.format(column_index_str)
+ col_properties = {'type': ['null', 'string']}
+ column_gs_type = 'stringValue'
+ LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
+ sheet_title, column_name, column_letter))
+ LOGGER.info(' This column will be skipped during data loading.')
+
+ if skipped >= 2:
+ # skipped = 2 consecutive skipped headers
+ # Remove prior_header column_name
+ sheet_json_schema['properties'].pop(prior_header, None)
+ LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
+ sheet_title, column_name, column_letter))
+ break
+
+ else:
+ column = {}
+ column = {
+ 'columnIndex': column_index,
+ 'columnLetter': column_letter,
+ 'columnName': column_name,
+ 'columnType': column_gs_type,
+ 'columnSkipped': column_is_skipped
+ }
+ columns.append(column)
- sheet_json_schema['properties'][column_name] = col_properties
+ sheet_json_schema['properties'][column_name] = col_properties
- prior_header = column_name
- i = i + 1
+ prior_header = column_name
+ i = i + 1
- return sheet_json_schema, columns
+ return sheet_json_schema, columns
# Get Header Row and 1st data row (Rows 1 & 2) from a Sheet on Spreadsheet w/ sheet_metadata query
sheet_metadata = sheet_md_results.get('sheets')[0]
# Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
- sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
+ try:
+ sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
+ except:
+ LOGGER.info('SKIPPING Malformed sheet: {}'.format(sheet_title))
+ sheet_json_schema, columns = None, None
return sheet_json_schema, columns