first commit with project files

Andreas Hauck 2025-01-08 21:45:13 +00:00
parent 3fb286e08a
commit e23f32db91
5 changed files with 215 additions and 0 deletions

README.md (Normal file, +2 lines)

@@ -0,0 +1,2 @@
# AMS_DATA_MINE

backup_postgres_single_db.sh (Executable file, +26 lines)

@@ -0,0 +1,26 @@
#!/bin/bash
# Load credentials from .env (expects POSTGRES_USER and POSTGRES_PASSWORD)
source .env

# Configuration
CONTAINER_NAME=postgres                # Name of the Postgres container
POSTGRES_DB=ams                        # Database to back up
BACKUP_DIR=/home/ams/postgres/backups  # Directory to store backup files
TIMESTAMP=$(date +"%Y%m%d%H%M%S")
BACKUP_FILE="$BACKUP_DIR/${POSTGRES_DB}_backup_$TIMESTAMP.sql"

# Ensure the backup directory exists
mkdir -p "$BACKUP_DIR"

# Execute pg_dump inside the Docker container.
# Note: no -t flag — a pseudo-TTY would add carriage returns to the dump output.
docker exec -e PGPASSWORD="$POSTGRES_PASSWORD" "$CONTAINER_NAME" \
  pg_dump -U "$POSTGRES_USER" "$POSTGRES_DB" > "$BACKUP_FILE"

# Optional: compress the backup file to save space
gzip "$BACKUP_FILE"

# Optional: remove backups older than 7 days
#find "$BACKUP_DIR" -type f -name "${POSTGRES_DB}_backup_*.sql.gz" -mtime +7 -delete
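
The script resolves .env relative to the working directory, so a scheduled run should cd into the project directory first. A minimal crontab sketch (the schedule and log path are assumptions, not part of this commit):

# Hypothetical crontab entry: run the backup every night at 02:00
0 2 * * * cd /home/ams/postgres && ./backup_postgres_single_db.sh >> backup.log 2>&1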

clean.py (Normal file, +48 lines)

@@ -0,0 +1,48 @@
import pandas as pd
import re
import sys


def sanitize_column_name(col):
    # Remove newline characters and leading/trailing whitespace
    col = col.replace('\n', ' ').replace('\r', ' ').strip()
    # Replace spaces and special characters with underscores
    col = re.sub(r'\W+', '_', col)
    # Ensure the column name is not empty
    if not col:
        col = 'column'
    return col


def make_unique(columns):
    # Append a numeric suffix to repeated names: col, col_2, col_3, ...
    counts = {}
    new_columns = []
    for col in columns:
        if col in counts:
            counts[col] += 1
            new_col = f"{col}_{counts[col] + 1}"
        else:
            counts[col] = 0
            new_col = col
        new_columns.append(new_col)
    return new_columns


def sanitize_csv_columns(input_csv, output_csv):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_csv, low_memory=False)
    # Sanitize column names
    df.columns = [sanitize_column_name(col) for col in df.columns]
    # Make column names unique by appending a number
    df.columns = make_unique(df.columns)
    # Save the sanitized DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    print(f"Sanitized CSV saved to {output_csv}")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python clean.py input_csv output_csv")
    else:
        input_csv = sys.argv[1]
        output_csv = sys.argv[2]
        sanitize_csv_columns(input_csv, output_csv)
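
A typical invocation, with hypothetical file names:

python clean.py survey_raw.csv survey_clean.csv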

csv_postgres.py (Normal file, +84 lines)

@@ -0,0 +1,84 @@
from dotenv import load_dotenv
import os
import glob
import pandas as pd
import psycopg2
import shutil # Import shutil to move files
load_dotenv()
# Directory containing your CSV files
csv_dir = '/home/ams/postgres/csv_files/'
# Directory where processed CSV files will be moved
csv_dir_old = '/home/ams/postgres/csv_files_old/'
# Ensure the csv_dir_old exists
if not os.path.exists(csv_dir_old):
    os.makedirs(csv_dir_old)
# Get a list of all CSV files in the directory
csv_files = glob.glob(os.path.join(csv_dir, '*.csv'))
# Connect to the PostgreSQL database
conn = psycopg2.connect(
    host="172.26.0.3",
    database="analytics_team",
    user=os.getenv("POSTGRES_USER"),
    password=os.getenv("POSTGRES_PASSWORD")
)
cur = conn.cursor()

for csv_file in csv_files:
    # Read the CSV file into a DataFrame with low_memory=False
    df = pd.read_csv(csv_file, low_memory=False)
    # Drop columns that are completely empty
    df.dropna(axis=1, how='all', inplace=True)
    # Replace NaN values with None to handle NULLs in PostgreSQL
    df = df.where(pd.notnull(df), None)
    # Get the filename without the extension
    filename = os.path.splitext(os.path.basename(csv_file))[0]
    # Define the table name
    table_name = f'survey_data_{filename}'
    # Drop the table if it already exists
    cur.execute(f'DROP TABLE IF EXISTS "{table_name}";')
    conn.commit()
    # Generate the CREATE TABLE query based on the DataFrame's columns and dtypes
    columns = []
    for col, dtype in zip(df.columns, df.dtypes):
        col_name = col.replace('"', '""')  # Escape double quotes in column names
        if 'int' in str(dtype):
            columns.append(f'"{col_name}" INTEGER')
        elif 'float' in str(dtype):
            columns.append(f'"{col_name}" FLOAT')
        else:
            columns.append(f'"{col_name}" TEXT')
    create_table_query = f'CREATE TABLE "{table_name}" ({", ".join(columns)});'
    print(f"Creating table {table_name}...")
    # Execute the CREATE TABLE query
    cur.execute(create_table_query)
    conn.commit()
    # Insert DataFrame records into the table
    for index, row in df.iterrows():
        placeholders = ', '.join(['%s'] * len(row))
        insert_query = f'INSERT INTO "{table_name}" VALUES ({placeholders});'
        # Convert numpy scalars to native Python types so psycopg2 can adapt them
        values = tuple(v.item() if hasattr(v, 'item') else v for v in row)
        cur.execute(insert_query, values)
    conn.commit()
    print(f"Data imported into {table_name} successfully!")
    # Move the processed file to the 'csv_files_old' directory
    shutil.move(csv_file, os.path.join(csv_dir_old, os.path.basename(csv_file)))

# Close the cursor and connection
cur.close()
conn.close()
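
Both csv_postgres.py and the backup script read credentials from a .env file in the working directory. A minimal sketch with placeholder values (the real credentials are, rightly, not part of this commit); the variable names come from the scripts and docker-compose.yml below:

POSTGRES_USER=ams
POSTGRES_PASSWORD=change_me
POSTGRES_DB=ams
PGADMIN_DEFAULT_EMAIL=admin@example.com
PGADMIN_DEFAULT_PASSWORD=change_me
TUNNEL_TOKEN=your_cloudflare_tunnel_token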

docker-compose.yml (Normal file, +55 lines)

@@ -0,0 +1,55 @@
services:
  postgres:
    image: postgres:15
    container_name: postgres
    environment:
      POSTGRES_USER: ${POSTGRES_USER}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      POSTGRES_DB: ${POSTGRES_DB}
    volumes:
      - postgres_data:/var/lib/postgresql/data # Use Docker-managed volume
      - /home/ams/postgres/csv_files:/data
    ports:
      - "5432:5432"
    networks:
      - postgres-network
    restart: always

  pgadmin:
    image: dpage/pgadmin4
    container_name: pgadmin
    environment:
      PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL}
      PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD}
      PGADMIN_CONFIG_WTF_CSRF_CHECK_DEFAULT: 'False'
      PGADMIN_CONFIG_WTF_CSRF_ENABLED: 'False'
    volumes:
      - pgadmin_data:/var/lib/pgadmin # Use Docker-managed volume
      - /home/ams/postgres/csv_files:/pgadmin/storage
    ports:
      - "5050:80"
    networks:
      - postgres-network
    restart: always

  tunnel:
    container_name: cloudflared-postgres
    image: cloudflare/cloudflared
    restart: unless-stopped
    command: tunnel run
    environment:
      - TUNNEL_TOKEN=${TUNNEL_TOKEN}
    networks:
      - postgres-network

volumes:
  postgres_data: # Docker-managed volume for PostgreSQL data
  pgadmin_data: # Docker-managed volume for pgAdmin data

networks:
  postgres-network:
    driver: bridge
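
With the .env file in place, the stack can be brought up with standard Compose commands (assuming Docker Compose v2):

docker compose up -d                # start postgres, pgadmin and the tunnel
docker compose logs -f postgres     # confirm the database is ready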