"""Sanitize the column names of a CSV file and save the result as a new CSV.

Usage: python sanitize_columns.py input_csv output_csv
"""
import pandas as pd
|
|
import re
|
|
import sys
|
|
|
|
def sanitize_column_name(col):
    """Return a cleaned-up version of the column name *col*.

    Line breaks are turned into spaces, surrounding whitespace is trimmed,
    and every run of non-word characters (anything other than letters,
    digits and underscore) is collapsed into a single underscore. If
    nothing is left after cleaning, the placeholder ``'column'`` is
    returned instead of an empty string.
    """
    # Map both line-break characters to a space in a single pass.
    line_breaks_to_space = str.maketrans({'\n': ' ', '\r': ' '})
    trimmed = col.translate(line_breaks_to_space).strip()

    # Collapse each run of non-word characters into one underscore.
    cleaned = re.sub(r'\W+', '_', trimmed)

    # An empty result falls back to a generic placeholder name.
    return cleaned or 'column'
|
|
|
|
def make_unique(columns):
    """Return a list of column names in which every entry is distinct.

    The first occurrence of a name is kept as-is. Each later occurrence
    gets a numeric suffix starting at ``_2`` (``a, a, a`` becomes
    ``a, a_2, a_3``). If a candidate name is already taken -- for example
    because the input also contains a literal ``a_2`` -- the suffix is
    increased (or, for a first occurrence, a suffix is added) until an
    unused name is found, so the result never contains duplicates.

    Args:
        columns: Iterable of hashable column names (typically strings).

    Returns:
        A new list with the same length and order as *columns*.
    """
    counts = {}   # base name -> highest suffix offset handed out so far
    taken = set()  # every name already emitted, to detect collisions
    new_columns = []
    for col in columns:
        offset = counts.get(col, 0)
        candidate = col
        # Keep bumping the suffix until the candidate is really unused;
        # a plain per-name counter is not enough because a generated
        # name such as "a_2" may also appear literally in the input.
        while candidate in taken:
            offset += 1
            candidate = f"{col}_{offset + 1}"
        counts[col] = offset
        taken.add(candidate)
        new_columns.append(candidate)
    return new_columns
|
|
|
|
def sanitize_csv_columns(input_csv, output_csv):
    """Rewrite a CSV file with sanitized, unique column names.

    The file at *input_csv* is loaded, each header is passed through
    ``sanitize_column_name``, the resulting names are de-duplicated with
    ``make_unique``, and the table is written to *output_csv* without an
    index column. A confirmation message is printed on success.

    Args:
        input_csv: Path (or file-like object) of the CSV to read.
        output_csv: Path (or file-like object) the result is written to.
    """
    # Only the header row is supposed to change, so read every cell as
    # plain text and disable missing-value detection. Without this,
    # type inference would rewrite the data on the round trip (e.g.
    # "007" -> 7, "NA" -> empty, integers next to blanks -> floats).
    df = pd.read_csv(
        input_csv,
        low_memory=False,
        dtype=str,
        na_filter=False,
    )

    # Clean each header, then resolve any names that became identical.
    sanitized = [sanitize_column_name(col) for col in df.columns]
    df.columns = make_unique(sanitized)

    # Save the table under the new headers; the row index is not written.
    df.to_csv(output_csv, index=False)
    print(f"Sanitized CSV saved to {output_csv}")
|
|
|
|
if __name__ == "__main__":
    # Exactly two positional arguments are expected after the script name.
    if len(sys.argv) != 3:
        # Passing a string to sys.exit writes it to stderr and exits with
        # status 1, so shells and calling scripts can detect the misuse
        # instead of seeing a successful (0) exit.
        sys.exit("Usage: python sanitize_columns.py input_csv output_csv")

    input_csv, output_csv = sys.argv[1:]
    sanitize_csv_columns(input_csv, output_csv)
|