From e23f32db915d5db66565484ee7d2be4ef6bd860b Mon Sep 17 00:00:00 2001
From: Andreas Hauck
Date: Wed, 8 Jan 2025 21:45:13 +0000
Subject: [PATCH] first commit with project files

---
 README.md                    |   2 +
 backup_postgres_single_db.sh |  39 +++++++++++++
 clean.py                     |  62 ++++++++++++++++++++
 csv_postgres.py              | 106 +++++++++++++++++++++++++++++++++++
 docker-compose.yml           |  57 +++++++++++++++++++
 5 files changed, 266 insertions(+)
 create mode 100644 README.md
 create mode 100755 backup_postgres_single_db.sh
 create mode 100644 clean.py
 create mode 100644 csv_postgres.py
 create mode 100644 docker-compose.yml

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2c37770
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+# AMS_DATA_MINE
+
diff --git a/backup_postgres_single_db.sh b/backup_postgres_single_db.sh
new file mode 100755
index 0000000..168d267
--- /dev/null
+++ b/backup_postgres_single_db.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#
+# Dump one PostgreSQL database from the Docker container into a timestamped,
+# gzip-compressed SQL file in BACKUP_DIR.
+
+# Stop on the first error or unset variable so that a failed dump is never
+# compressed and mistaken for a good backup.
+set -euo pipefail
+
+# Load variables from .env
+source .env
+
+# Configuration
+CONTAINER_NAME=postgres  # Replace with your container name
+POSTGRES_USER=$POSTGRES_USER  # Replace with your PostgreSQL username
+POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-}  # Replace with your PostgreSQL password (if required)
+POSTGRES_DB=ams  # Replace with the database you want to back up
+BACKUP_DIR=/home/ams/postgres/backups  # Directory to store backup files
+TIMESTAMP=$(date +"%Y%m%d%H%M%S")
+BACKUP_FILE=$BACKUP_DIR/${POSTGRES_DB}_backup_$TIMESTAMP.sql
+
+# Ensure the backup directory exists
+mkdir -p "$BACKUP_DIR"
+
+# Execute pg_dump inside the Docker container.
+# No -t: a pseudo-TTY would rewrite line endings and corrupt the dump.
+if ! docker exec -e PGPASSWORD="$POSTGRES_PASSWORD" "$CONTAINER_NAME" \
+        pg_dump -U "$POSTGRES_USER" "$POSTGRES_DB" > "$BACKUP_FILE"; then
+    # Do not leave a partial dump behind
+    rm -f "$BACKUP_FILE"
+    echo "pg_dump failed; no backup was written" >&2
+    exit 1
+fi
+
+# Optional: Compress the backup file to save space
+gzip "$BACKUP_FILE"
+
+# Optional: Remove backups older than 7 days
+#find $BACKUP_DIR -type f -name "${POSTGRES_DB}_backup_*.sql.gz" -mtime +7 -delete
diff --git a/clean.py b/clean.py
new file mode 100644
index 0000000..20136a2
--- /dev/null
+++ b/clean.py
@@ -0,0 +1,62 @@
+"""Sanitize the column names of a CSV file."""
+
+import re
+import sys
+
+import pandas as pd
+
+
+def sanitize_column_name(col):
+    """Return ``col`` rewritten to contain only word characters.
+
+    Line breaks are turned into spaces, surrounding whitespace is stripped and
+    each run of non-word characters becomes one underscore. A name that ends
+    up empty is replaced by ``'column'``.
+    """
+    # Remove newline characters and leading/trailing whitespace
+    col = col.replace('\n', ' ').replace('\r', ' ').strip()
+    # Replace spaces and special characters with underscores
+    col = re.sub(r'\W+', '_', col)
+    # Ensure the column name is not empty
+    return col or 'column'
+
+
+def make_unique(columns):
+    """Return ``columns`` as a list in which every name occurs only once.
+
+    The first occurrence of a name is kept. Later occurrences get ``_2``,
+    ``_3``, ... appended; a suffixed name that is already taken is skipped, so
+    ``['a', 'a', 'a_2']`` becomes ``['a', 'a_2', 'a_2_2']`` instead of
+    producing ``'a_2'`` twice.
+    """
+    counts = {}
+    seen = set()
+    new_columns = []
+    for col in columns:
+        new_col = col
+        while new_col in seen:
+            counts[col] = counts.get(col, 1) + 1
+            new_col = f"{col}_{counts[col]}"
+        seen.add(new_col)
+        new_columns.append(new_col)
+    return new_columns
+
+
+def sanitize_csv_columns(input_csv, output_csv):
+    """Write a copy of ``input_csv`` with sanitized, unique column names."""
+    # Read the CSV file into a DataFrame
+    df = pd.read_csv(input_csv, low_memory=False)
+
+    # Sanitize column names, then make them unique by appending a number
+    df.columns = make_unique(sanitize_column_name(col) for col in df.columns)
+
+    # Save the sanitized DataFrame to a new CSV file
+    df.to_csv(output_csv, index=False)
+    print(f"Sanitized CSV saved to {output_csv}")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        # The script is clean.py; exit non-zero so callers notice misuse
+        sys.exit("Usage: python clean.py input_csv output_csv")
+    sanitize_csv_columns(sys.argv[1], sys.argv[2])
diff --git a/csv_postgres.py b/csv_postgres.py
new file mode 100644
index 0000000..d679a65
--- /dev/null
+++ b/csv_postgres.py
@@ -0,0 +1,106 @@
+"""Load every CSV file in ``csv_dir`` into its own PostgreSQL table."""
+
+from dotenv import load_dotenv
+import os
+import glob
+import pandas as pd
+import psycopg2
+import shutil  # Import shutil to move files
+
+load_dotenv()
+
+# Directory containing your CSV files
+csv_dir = '/home/ams/postgres/csv_files/'
+
+# Directory where processed CSV files will be moved
+csv_dir_old = '/home/ams/postgres/csv_files_old/'
+
+
+def quote_ident(name):
+    """Return ``name`` as a double-quoted identifier with quotes escaped."""
+    return '"' + str(name).replace('"', '""') + '"'
+
+
+def column_type(dtype):
+    """Return the PostgreSQL column type used for a pandas ``dtype``."""
+    if 'int' in str(dtype):
+        # pandas parses integers as 64-bit values; a 4-byte INTEGER can overflow
+        return 'BIGINT'
+    if 'float' in str(dtype):
+        return 'FLOAT'
+    return 'TEXT'
+
+
+def import_csv(cur, csv_file):
+    """Recreate the table for ``csv_file`` and insert all of its rows.
+
+    Nothing is committed here; the caller commits so that the drop, create
+    and inserts of one file take effect together. Returns the table name.
+    """
+    # Read the CSV file into a DataFrame with low_memory=False
+    df = pd.read_csv(csv_file, low_memory=False)
+
+    # Drop columns that are completely empty
+    df = df.dropna(axis=1, how='all')
+
+    # Get the filename without the extension and define the table name
+    filename = os.path.splitext(os.path.basename(csv_file))[0]
+    table_name = f'survey_data_{filename}'
+    table = quote_ident(table_name)
+
+    # Column types come from the parsed dtypes, before the object cast below
+    columns = ', '.join(
+        f'{quote_ident(col)} {column_type(dtype)}'
+        for col, dtype in zip(df.columns, df.dtypes)
+    )
+
+    print(f"Creating table {table_name}...")
+    cur.execute(f'DROP TABLE IF EXISTS {table};')
+    cur.execute(f'CREATE TABLE {table} ({columns});')
+
+    # Cast to object so NaN can become None (NULL) in numeric columns too and
+    # the rows hold plain Python values instead of NumPy scalars
+    df = df.astype(object).where(df.notna(), None)
+
+    if not df.empty:
+        placeholders = ', '.join(['%s'] * len(df.columns))
+        insert_query = f'INSERT INTO {table} VALUES ({placeholders});'
+        rows = list(df.itertuples(index=False, name=None))
+        cur.executemany(insert_query, rows)
+    return table_name
+
+
+def main():
+    """Import all CSV files from ``csv_dir`` and move them to ``csv_dir_old``."""
+    # Ensure the csv_dir_old exists
+    os.makedirs(csv_dir_old, exist_ok=True)
+
+    # Get a list of all CSV files in the directory
+    csv_files = glob.glob(os.path.join(csv_dir, '*.csv'))
+
+    # Connect to the PostgreSQL database. POSTGRES_HOST may override the
+    # default address, since a container's bridge IP is not guaranteed to stay
+    # the same.
+    conn = psycopg2.connect(
+        host=os.getenv("POSTGRES_HOST", "172.26.0.3"),
+        database="analytics_team",
+        user=os.getenv("POSTGRES_USER"),
+        password=os.getenv("POSTGRES_PASSWORD")
+    )
+    try:
+        with conn.cursor() as cur:
+            for csv_file in csv_files:
+                table_name = import_csv(cur, csv_file)
+                # One commit per file: if the import fails, the uncommitted
+                # drop is discarded and the previous table is kept
+                conn.commit()
+                print(f"Data imported into {table_name} successfully!")
+
+                # Move the processed file to the 'csv_files_old' directory
+                shutil.move(csv_file, os.path.join(csv_dir_old, os.path.basename(csv_file)))
+    finally:
+        conn.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..8b88cd0
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,57 @@
+
+services:
+  postgres:
+    image: postgres:15
+    container_name: postgres
+    environment:
+      POSTGRES_USER: ${POSTGRES_USER}
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
+      POSTGRES_DB: ${POSTGRES_DB}
+    volumes:
+      - postgres_data:/var/lib/postgresql/data  # Use Docker-managed volume
+      - /home/ams/postgres/csv_files:/data
+    ports:
+      - "5432:5432"
+    networks:
+      - postgres-network
+    restart: always
+
+  pgadmin:
+    image: dpage/pgadmin4
+    container_name: pgadmin
+    environment:
+      PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL}
+      PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD}
+      # NOTE(review): CSRF protection is switched off below; confirm this is
+      # still needed, especially if pgAdmin is exposed through the tunnel.
+      PGADMIN_CONFIG_WTF_CSRF_CHECK_DEFAULT: 'False'
+      PGADMIN_CONFIG_WTF_CSRF_ENABLED: 'False'
+    volumes:
+      - pgadmin_data:/var/lib/pgadmin  # Use Docker-managed volume
+      - /home/ams/postgres/csv_files:/pgadmin/storage
+    ports:
+      - "5050:80"
+    networks:
+      - postgres-network
+    restart: always
+
+
+  tunnel:
+    container_name: cloudflared-postgres
+    image: cloudflare/cloudflared
+    restart: unless-stopped
+    command: tunnel run
+    environment:
+      - TUNNEL_TOKEN=${TUNNEL_TOKEN}
+    networks:
+      - postgres-network
+
+
+volumes:
+  postgres_data:  # Docker-managed volume for PostgreSQL data
+  pgadmin_data:  # Docker-managed volume for pgAdmin data
+
+
+networks:
+  postgres-network:
+    driver: bridge