# AMS_DATA_MINE/clean.py
#
# 49 lines
# 1.4 KiB
# Python

import pandas as pd
import re
import sys
def sanitize_column_name(col):
    """Return a cleaned-up version of a single column name.

    Line breaks (``\\n`` and ``\\r``) are turned into spaces, surrounding
    whitespace is trimmed, and every run of non-word characters (whatever
    ``\\W`` matches) is collapsed into one underscore.

    Args:
        col: The raw column name as a string.

    Returns:
        The sanitized name, or ``'column'`` when nothing is left after
        trimming (i.e. the input was empty or whitespace only).
    """
    # Flatten embedded line breaks first so they are trimmed/collapsed like
    # any other whitespace.
    flattened = col.replace('\n', ' ')
    flattened = flattened.replace('\r', ' ')
    trimmed = flattened.strip()

    # One underscore per run of non-word characters.
    cleaned = re.sub(r'\W+', '_', trimmed)

    # An empty result falls back to a generic placeholder name.
    return cleaned or 'column'
def make_unique(columns):
    """Return column names with duplicates disambiguated by a numeric suffix.

    The first occurrence of a name is kept as-is; later occurrences become
    ``name_2``, ``name_3``, and so on. A suffixed candidate is skipped if that
    exact name has already been emitted (for example when the input already
    contains ``name_2``), so the returned names are guaranteed to be distinct.

    Args:
        columns: Iterable of hashable column names, in their original order.

    Returns:
        A new list of the same length and order in which every entry is
        unique.
    """
    counts = {}      # base name -> number of renamed duplicates issued so far
    used = set()     # every name already placed in the result
    unique_columns = []
    for col in columns:
        candidate = col
        n = counts.get(col, 0)
        # Keep advancing the suffix until the candidate is genuinely unused;
        # a plain per-name counter can collide with a literal "name_2" column.
        while candidate in used:
            n += 1
            candidate = f"{col}_{n + 1}"
        counts[col] = n
        used.add(candidate)
        unique_columns.append(candidate)
    return unique_columns
def sanitize_csv_columns(input_csv, output_csv):
    """Rewrite a CSV file with sanitized, unique column names.

    The file is loaded with pandas, each header is passed through
    ``sanitize_column_name``, the resulting names are de-duplicated with
    ``make_unique``, and the frame is written to ``output_csv`` without the
    index column. A confirmation line is printed when done.

    Args:
        input_csv: Path (or other source accepted by ``pd.read_csv``) of the
            CSV to read.
        output_csv: Destination passed to ``DataFrame.to_csv``.
    """
    # NOTE(review): the data rows pass through pandas' default parsing, so
    # their textual form in the output may differ from the input - confirm
    # this is acceptable for the datasets involved.
    frame = pd.read_csv(input_csv, low_memory=False)

    # Clean every header, then resolve any clashes among the cleaned names.
    cleaned_names = list(map(sanitize_column_name, frame.columns))
    frame.columns = make_unique(cleaned_names)

    frame.to_csv(output_csv, index=False)
    print(f"Sanitized CSV saved to {output_csv}")
if __name__ == "__main__":
    # Command-line entry point: expects exactly two positional arguments,
    # the CSV to read and the CSV to write.
    if len(sys.argv) != 3:
        # sys.exit with a string prints it to stderr and exits with status 1,
        # so shells and calling scripts can detect the misuse. The script
        # name is taken from argv so the hint stays right if the file is
        # renamed.
        sys.exit(f"Usage: python {sys.argv[0]} input_csv output_csv")
    input_csv, output_csv = sys.argv[1:]
    sanitize_csv_columns(input_csv, output_csv)