first commit with project files
parent 3fb286e08a
commit e23f32db91

backup_postgres_single_db.sh (Executable file, 26 lines)
@@ -0,0 +1,26 @@
#!/bin/bash

# Load variables from .env
source .env

# Configuration
CONTAINER_NAME=postgres                 # Name of the Postgres container
POSTGRES_USER=$POSTGRES_USER            # PostgreSQL username (loaded from .env)
POSTGRES_PASSWORD=$POSTGRES_PASSWORD    # PostgreSQL password (loaded from .env, if required)
POSTGRES_DB=ams                         # Database to back up
BACKUP_DIR=/home/ams/postgres/backups   # Directory to store backup files
TIMESTAMP=$(date +"%Y%m%d%H%M%S")
BACKUP_FILE="$BACKUP_DIR/${POSTGRES_DB}_backup_$TIMESTAMP.sql"

# Ensure the backup directory exists
mkdir -p "$BACKUP_DIR"

# Execute pg_dump inside the Docker container.
# Note: no -t (pseudo-TTY) flag; a TTY can inject carriage returns into the
# redirected output and corrupt the dump.
docker exec -e PGPASSWORD="$POSTGRES_PASSWORD" "$CONTAINER_NAME" pg_dump -U "$POSTGRES_USER" "$POSTGRES_DB" > "$BACKUP_FILE"

# Optional: Compress the backup file to save space
gzip "$BACKUP_FILE"

# Optional: Remove backups older than 7 days
#find "$BACKUP_DIR" -type f -name "${POSTGRES_DB}_backup_*.sql.gz" -mtime +7 -delete
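
Not part of the commit, but the natural next step is scheduling the script. A possible crontab entry, assuming the repo (and the .env it sources from the working directory) lives at /home/ams/postgres; the path and schedule are placeholders:

# Run nightly at 02:00; cd first so `source .env` finds the file
0 2 * * * cd /home/ams/postgres && ./backup_postgres_single_db.sh >> /home/ams/postgres/backups/backup.log 2>&1

Restoring should be the reverse pipe, e.g. gunzip -c backups/<dump>.sql.gz | docker exec -i postgres psql -U "$POSTGRES_USER" -d ams (dump name elided here on purpose).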

clean.py (Normal file, 48 lines)
@@ -0,0 +1,48 @@
import pandas as pd
import re
import sys

def sanitize_column_name(col):
    # Remove newline characters and leading/trailing whitespace
    col = col.replace('\n', ' ').replace('\r', ' ').strip()
    # Replace runs of spaces and special characters with underscores
    col = re.sub(r'\W+', '_', col)
    # Ensure the column name is not empty
    if not col:
        col = 'column'
    return col

def make_unique(columns):
    counts = {}
    new_columns = []
    for col in columns:
        if col in counts:
            counts[col] += 1
            new_col = f"{col}_{counts[col]+1}"
        else:
            counts[col] = 0
            new_col = col
        new_columns.append(new_col)
    return new_columns

def sanitize_csv_columns(input_csv, output_csv):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_csv, low_memory=False)

    # Sanitize column names
    df.columns = [sanitize_column_name(col) for col in df.columns]

    # Make duplicate column names unique by appending a number
    df.columns = make_unique(df.columns)

    # Save the sanitized DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    print(f"Sanitized CSV saved to {output_csv}")

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python clean.py input_csv output_csv")
    else:
        input_csv = sys.argv[1]
        output_csv = sys.argv[2]
        sanitize_csv_columns(input_csv, output_csv)
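
A quick illustration of what the two helpers do, using hypothetical survey-style headers (note the trailing underscore left by the closing parenthesis, and the _2 suffix on the second duplicate):

# Hypothetical headers, for illustration only
raw = ['Q1: Age\n(years)', 'Score', 'Score']
print(make_unique([sanitize_column_name(c) for c in raw]))
# ['Q1_Age_years_', 'Score', 'Score_2']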

csv_postgres.py (Normal file, 84 lines)
@@ -0,0 +1,84 @@
from dotenv import load_dotenv
import os
import glob
import pandas as pd
import psycopg2
import shutil  # Used to move processed files

load_dotenv()

# Directory containing your CSV files
csv_dir = '/home/ams/postgres/csv_files/'

# Directory where processed CSV files will be moved
csv_dir_old = '/home/ams/postgres/csv_files_old/'

# Ensure csv_dir_old exists
if not os.path.exists(csv_dir_old):
    os.makedirs(csv_dir_old)

# Get a list of all CSV files in the directory
csv_files = glob.glob(os.path.join(csv_dir, '*.csv'))

# Connect to the PostgreSQL database
conn = psycopg2.connect(
    host="172.26.0.3",
    database="analytics_team",
    user=os.getenv("POSTGRES_USER"),
    password=os.getenv("POSTGRES_PASSWORD")
)
cur = conn.cursor()

for csv_file in csv_files:
    # Read the CSV file into a DataFrame with low_memory=False
    df = pd.read_csv(csv_file, low_memory=False)

    # Drop columns that are completely empty
    df.dropna(axis=1, how='all', inplace=True)

    # Replace NaN values with None so they become NULLs in PostgreSQL
    df = df.where(pd.notnull(df), None)

    # Get the filename without the extension
    filename = os.path.splitext(os.path.basename(csv_file))[0]

    # Define the table name
    table_name = f'survey_data_{filename}'

    # Drop the table if it already exists
    cur.execute(f'DROP TABLE IF EXISTS "{table_name}";')
    conn.commit()

    # Generate the CREATE TABLE query from the DataFrame's columns and dtypes
    columns = []
    for col, dtype in zip(df.columns, df.dtypes):
        col_name = col.replace('"', '""')  # Escape double quotes in column names
        if 'int' in str(dtype):
            columns.append(f'"{col_name}" INTEGER')
        elif 'float' in str(dtype):
            columns.append(f'"{col_name}" FLOAT')
        else:
            columns.append(f'"{col_name}" TEXT')

    create_table_query = f'CREATE TABLE "{table_name}" ({", ".join(columns)});'
    print(f"Creating table {table_name}...")

    # Execute the CREATE TABLE query
    cur.execute(create_table_query)
    conn.commit()

    # Insert DataFrame records into the table, one row at a time
    for index, row in df.iterrows():
        placeholders = ', '.join(['%s'] * len(row))
        insert_query = f'INSERT INTO "{table_name}" VALUES ({placeholders});'
        cur.execute(insert_query, tuple(row))

    conn.commit()
    print(f"Data imported into {table_name} successfully!")

    # Move the processed file to the 'csv_files_old' directory
    shutil.move(csv_file, os.path.join(csv_dir_old, os.path.basename(csv_file)))

# Close the cursor and connection
cur.close()
conn.close()
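
The row-by-row INSERT loop above costs one round trip per row, which gets slow on large CSVs. Not part of this commit, but a common psycopg2 speed-up is batching with execute_values; a minimal sketch of the replacement for the iterrows loop, assuming the same df, cur, and table_name:

from psycopg2.extras import execute_values

# Plain tuples in column order; None values become NULLs
rows = [tuple(r) for r in df.itertuples(index=False, name=None)]
execute_values(cur, f'INSERT INTO "{table_name}" VALUES %s', rows)
conn.commit()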

docker-compose.yml (Normal file, 55 lines)
@@ -0,0 +1,55 @@
services:
  postgres:
    image: postgres:15
    container_name: postgres
    environment:
      POSTGRES_USER: ${POSTGRES_USER}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      POSTGRES_DB: ${POSTGRES_DB}
    volumes:
      - postgres_data:/var/lib/postgresql/data  # Docker-managed volume
      - /home/ams/postgres/csv_files:/data
    ports:
      - "5432:5432"
    networks:
      - postgres-network
    restart: always

  pgadmin:
    image: dpage/pgadmin4
    container_name: pgadmin
    environment:
      PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL}
      PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD}
      PGADMIN_CONFIG_WTF_CSRF_CHECK_DEFAULT: 'False'
      PGADMIN_CONFIG_WTF_CSRF_ENABLED: 'False'
    volumes:
      - pgadmin_data:/var/lib/pgadmin  # Docker-managed volume
      - /home/ams/postgres/csv_files:/pgadmin/storage
    ports:
      - "5050:80"
    networks:
      - postgres-network
    restart: always

  tunnel:
    container_name: cloudflared-postgres
    image: cloudflare/cloudflared
    restart: unless-stopped
    command: tunnel run
    environment:
      - TUNNEL_TOKEN=${TUNNEL_TOKEN}
    networks:
      - postgres-network

volumes:
  postgres_data:  # Docker-managed volume for PostgreSQL data
  pgadmin_data:   # Docker-managed volume for pgAdmin data

networks:
  postgres-network:
    driver: bridge
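
All four files read their secrets from a .env file next to docker-compose.yml, which is (rightly) not committed. A hypothetical template listing the variables referenced above; every value is a placeholder:

# .env template; values are placeholders, not from this repo
POSTGRES_USER=ams
POSTGRES_PASSWORD=change-me
POSTGRES_DB=ams
PGADMIN_DEFAULT_EMAIL=admin@example.com
PGADMIN_DEFAULT_PASSWORD=change-me
TUNNEL_TOKEN=your-cloudflare-tunnel-token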