first commit with project files
parent 3fb286e08a · commit e23f32db91
backup_postgres_single_db.sh · 26 lines · Executable file
@@ -0,0 +1,26 @@
#!/bin/bash

# Load variables from .env
source .env

# Configuration
CONTAINER_NAME=postgres                  # Replace with your container name
POSTGRES_USER=$POSTGRES_USER             # Replace with your PostgreSQL username
POSTGRES_PASSWORD=$POSTGRES_PASSWORD     # Replace with your PostgreSQL password (if required)
POSTGRES_DB=ams                          # Replace with the database you want to back up
BACKUP_DIR=/home/ams/postgres/backups    # Directory to store backup files
TIMESTAMP=$(date +"%Y%m%d%H%M%S")
BACKUP_FILE=$BACKUP_DIR/${POSTGRES_DB}_backup_$TIMESTAMP.sql

# Ensure the backup directory exists
mkdir -p $BACKUP_DIR

# Execute pg_dump inside the Docker container
# (no -t: a pseudo-TTY would add carriage returns to the redirected dump)
docker exec -e PGPASSWORD=$POSTGRES_PASSWORD $CONTAINER_NAME pg_dump -U $POSTGRES_USER $POSTGRES_DB > $BACKUP_FILE

# Optional: Compress the backup file to save space
gzip $BACKUP_FILE

# Optional: Remove backups older than 7 days
#find $BACKUP_DIR -type f -name "${POSTGRES_DB}_backup_*.sql.gz" -mtime +7 -delete
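For reference, a minimal sketch of how this script might be invoked. It must run from the directory holding the .env it sources; the cron schedule and path below are hypothetical, not part of this commit:

chmod +x backup_postgres_single_db.sh
./backup_postgres_single_db.sh
# Or run it nightly; cd first so `source .env` resolves:
# 0 2 * * * cd /home/ams/postgres && ./backup_postgres_single_db.sh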
clean.py · 48 lines · Normal file
@@ -0,0 +1,48 @@
import pandas as pd
import re
import sys


def sanitize_column_name(col):
    # Remove newline characters and leading/trailing whitespace
    col = col.replace('\n', ' ').replace('\r', ' ').strip()
    # Replace spaces and special characters with underscores
    col = re.sub(r'\W+', '_', col)
    # Ensure the column name is not empty
    if not col:
        col = 'column'
    return col


def make_unique(columns):
    counts = {}
    new_columns = []
    for col in columns:
        if col in counts:
            counts[col] += 1
            new_col = f"{col}_{counts[col]+1}"
        else:
            counts[col] = 0
            new_col = col
        new_columns.append(new_col)
    return new_columns


def sanitize_csv_columns(input_csv, output_csv):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_csv, low_memory=False)

    # Sanitize column names
    df.columns = [sanitize_column_name(col) for col in df.columns]

    # Make column names unique by appending a number
    df.columns = make_unique(df.columns)

    # Save the sanitized DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    print(f"Sanitized CSV saved to {output_csv}")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python clean.py input_csv output_csv")
    else:
        input_csv = sys.argv[1]
        output_csv = sys.argv[2]
        sanitize_csv_columns(input_csv, output_csv)
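A quick usage sketch for the script above; the CSV file names are placeholders, not from this commit:

# Rewrite headers of a raw export so they are SQL-friendly and unique
python clean.py survey_raw.csv survey_clean.csv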
csv_postgres.py · 84 lines · Normal file
@@ -0,0 +1,84 @@
from dotenv import load_dotenv
import os
import glob
import pandas as pd
import psycopg2
import shutil  # Import shutil to move files

load_dotenv()

# Directory containing your CSV files
csv_dir = '/home/ams/postgres/csv_files/'

# Directory where processed CSV files will be moved
csv_dir_old = '/home/ams/postgres/csv_files_old/'

# Ensure the csv_dir_old exists
if not os.path.exists(csv_dir_old):
    os.makedirs(csv_dir_old)

# Get a list of all CSV files in the directory
csv_files = glob.glob(os.path.join(csv_dir, '*.csv'))

# Connect to the PostgreSQL database
conn = psycopg2.connect(
    host="172.26.0.3",
    database="analytics_team",
    user=os.getenv("POSTGRES_USER"),
    password=os.getenv("POSTGRES_PASSWORD")
)
cur = conn.cursor()

for csv_file in csv_files:
    # Read the CSV file into a DataFrame with low_memory=False
    df = pd.read_csv(csv_file, low_memory=False)

    # Drop columns that are completely empty
    df.dropna(axis=1, how='all', inplace=True)

    # Replace NaN values with None to handle NULLs in PostgreSQL
    df = df.where(pd.notnull(df), None)

    # Get the filename without the extension
    filename = os.path.splitext(os.path.basename(csv_file))[0]

    # Define the table name
    table_name = f'survey_data_{filename}'

    # Drop the table if it already exists
    cur.execute(f'DROP TABLE IF EXISTS "{table_name}";')
    conn.commit()

    # Generate the CREATE TABLE query based on DataFrame's columns and data types
    columns = []
    for col, dtype in zip(df.columns, df.dtypes):
        col_name = col.replace('"', '""')  # Escape double quotes in column names
        if 'int' in str(dtype):
            columns.append(f'"{col_name}" INTEGER')
        elif 'float' in str(dtype):
            columns.append(f'"{col_name}" FLOAT')
        else:
            columns.append(f'"{col_name}" TEXT')

    create_table_query = f'CREATE TABLE "{table_name}" ({", ".join(columns)});'
    print(f"Creating table {table_name}...")

    # Execute the CREATE TABLE query
    cur.execute(create_table_query)
    conn.commit()

    # Insert DataFrame records into the table
    for index, row in df.iterrows():
        placeholders = ', '.join(['%s'] * len(row))
        insert_query = f'INSERT INTO "{table_name}" VALUES ({placeholders});'
        cur.execute(insert_query, tuple(row))

    conn.commit()
    print(f"Data imported into {table_name} successfully!")

    # Move the processed file to the 'csv_files_old' directory
    shutil.move(csv_file, os.path.join(csv_dir_old, os.path.basename(csv_file)))

# Close the cursor and connection
cur.close()
conn.close()
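A typical run, sketched below with hypothetical file names. The hardcoded host 172.26.0.3 is presumably the postgres container's address on the compose bridge network defined in docker-compose.yml; each staged CSV becomes its own survey_data_&lt;filename&gt; table and is then moved out of the watched directory:

# Stage sanitized exports, then load them
cp survey_clean.csv /home/ams/postgres/csv_files/
python csv_postgres.py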
docker-compose.yml · 55 lines · Normal file
@@ -0,0 +1,55 @@
services:
  postgres:
    image: postgres:15
    container_name: postgres
    environment:
      POSTGRES_USER: ${POSTGRES_USER}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      POSTGRES_DB: ${POSTGRES_DB}
    volumes:
      - postgres_data:/var/lib/postgresql/data # Use Docker-managed volume
      - /home/ams/postgres/csv_files:/data
    ports:
      - "5432:5432"
    networks:
      - postgres-network
    restart: always

  pgadmin:
    image: dpage/pgadmin4
    container_name: pgadmin
    environment:
      PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL}
      PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD}
      PGADMIN_CONFIG_WTF_CSRF_CHECK_DEFAULT: 'False'
      PGADMIN_CONFIG_WTF_CSRF_ENABLED: 'False'
    volumes:
      - pgadmin_data:/var/lib/pgadmin # Use Docker-managed volume
      - /home/ams/postgres/csv_files:/pgadmin/storage
    ports:
      - "5050:80"
    networks:
      - postgres-network
    restart: always

  tunnel:
    container_name: cloudflared-postgres
    image: cloudflare/cloudflared
    restart: unless-stopped
    command: tunnel run
    environment:
      - TUNNEL_TOKEN=${TUNNEL_TOKEN}
    networks:
      - postgres-network

volumes:
  postgres_data: # Docker-managed volume for PostgreSQL data
  pgadmin_data: # Docker-managed volume for pgAdmin data

networks:
  postgres-network:
    driver: bridge
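The compose file and scripts read their secrets from a .env file that is not part of this commit. A minimal sketch with placeholder values; the variable names are taken from the files above, the values are not:

POSTGRES_USER=postgres
POSTGRES_PASSWORD=change_me
POSTGRES_DB=ams
PGADMIN_DEFAULT_EMAIL=admin@example.com
PGADMIN_DEFAULT_PASSWORD=change_me
TUNNEL_TOKEN=your_cloudflare_tunnel_token

With that in place, `docker compose up -d` brings up Postgres on 5432, pgAdmin on 5050, and the Cloudflare tunnel.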