first commit with project files
parent 3fb286e08a
commit e23f32db91

backup_postgres_single_db.sh (Executable file, 26 lines)
@@ -0,0 +1,26 @@
#!/bin/bash

# Load variables from .env
source .env

# Configuration
CONTAINER_NAME=postgres                 # Name of the Postgres container
POSTGRES_USER=$POSTGRES_USER            # PostgreSQL username (loaded from .env)
POSTGRES_PASSWORD=$POSTGRES_PASSWORD    # PostgreSQL password (loaded from .env, if required)
POSTGRES_DB=ams                         # Database to back up
BACKUP_DIR=/home/ams/postgres/backups   # Directory to store backup files
TIMESTAMP=$(date +"%Y%m%d%H%M%S")
BACKUP_FILE="$BACKUP_DIR/${POSTGRES_DB}_backup_$TIMESTAMP.sql"

# Ensure the backup directory exists
mkdir -p "$BACKUP_DIR"

# Execute pg_dump inside the Docker container.
# Note: no -t (pseudo-TTY) flag; a TTY can inject carriage returns into the
# redirected output and corrupt the dump.
docker exec -e PGPASSWORD="$POSTGRES_PASSWORD" "$CONTAINER_NAME" pg_dump -U "$POSTGRES_USER" "$POSTGRES_DB" > "$BACKUP_FILE"

# Optional: Compress the backup file to save space
gzip "$BACKUP_FILE"

# Optional: Remove backups older than 7 days
#find "$BACKUP_DIR" -type f -name "${POSTGRES_DB}_backup_*.sql.gz" -mtime +7 -delete
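
Not part of the commit, but the natural next step is scheduling the script. A possible crontab entry, assuming the repo (and the .env it sources from the working directory) lives at /home/ams/postgres; the path and schedule are placeholders:

# Run nightly at 02:00; cd first so `source .env` finds the file
0 2 * * * cd /home/ams/postgres && ./backup_postgres_single_db.sh >> /home/ams/postgres/backups/backup.log 2>&1

Restoring should be the reverse pipe, e.g. gunzip -c backups/<dump>.sql.gz | docker exec -i postgres psql -U "$POSTGRES_USER" -d ams (dump name elided here on purpose).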

clean.py (Normal file, 48 lines)
@@ -0,0 +1,48 @@
import pandas as pd
import re
import sys

def sanitize_column_name(col):
    # Remove newline characters and leading/trailing whitespace
    col = col.replace('\n', ' ').replace('\r', ' ').strip()
    # Replace runs of spaces and special characters with underscores
    col = re.sub(r'\W+', '_', col)
    # Ensure the column name is not empty
    if not col:
        col = 'column'
    return col

def make_unique(columns):
    counts = {}
    new_columns = []
    for col in columns:
        if col in counts:
            counts[col] += 1
            new_col = f"{col}_{counts[col]+1}"
        else:
            counts[col] = 0
            new_col = col
        new_columns.append(new_col)
    return new_columns

def sanitize_csv_columns(input_csv, output_csv):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_csv, low_memory=False)

    # Sanitize column names
    df.columns = [sanitize_column_name(col) for col in df.columns]

    # Make duplicate column names unique by appending a number
    df.columns = make_unique(df.columns)

    # Save the sanitized DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    print(f"Sanitized CSV saved to {output_csv}")

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python clean.py input_csv output_csv")
    else:
        input_csv = sys.argv[1]
        output_csv = sys.argv[2]
        sanitize_csv_columns(input_csv, output_csv)
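
A quick illustration of what the two helpers do, using hypothetical survey-style headers (note the trailing underscore left by the closing parenthesis, and the _2 suffix on the second duplicate):

# Hypothetical headers, for illustration only
raw = ['Q1: Age\n(years)', 'Score', 'Score']
print(make_unique([sanitize_column_name(c) for c in raw]))
# ['Q1_Age_years_', 'Score', 'Score_2']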

csv_postgres.py (Normal file, 84 lines)
@@ -0,0 +1,84 @@
from dotenv import load_dotenv
import os
import glob
import pandas as pd
import psycopg2
import shutil  # Used to move processed files

load_dotenv()

# Directory containing your CSV files
csv_dir = '/home/ams/postgres/csv_files/'

# Directory where processed CSV files will be moved
csv_dir_old = '/home/ams/postgres/csv_files_old/'

# Ensure csv_dir_old exists
if not os.path.exists(csv_dir_old):
    os.makedirs(csv_dir_old)

# Get a list of all CSV files in the directory
csv_files = glob.glob(os.path.join(csv_dir, '*.csv'))

# Connect to the PostgreSQL database
conn = psycopg2.connect(
    host="172.26.0.3",
    database="analytics_team",
    user=os.getenv("POSTGRES_USER"),
    password=os.getenv("POSTGRES_PASSWORD")
)
cur = conn.cursor()

for csv_file in csv_files:
    # Read the CSV file into a DataFrame with low_memory=False
    df = pd.read_csv(csv_file, low_memory=False)

    # Drop columns that are completely empty
    df.dropna(axis=1, how='all', inplace=True)

    # Replace NaN values with None so they become NULLs in PostgreSQL
    df = df.where(pd.notnull(df), None)

    # Get the filename without the extension
    filename = os.path.splitext(os.path.basename(csv_file))[0]

    # Define the table name
    table_name = f'survey_data_{filename}'

    # Drop the table if it already exists
    cur.execute(f'DROP TABLE IF EXISTS "{table_name}";')
    conn.commit()

    # Generate the CREATE TABLE query from the DataFrame's columns and dtypes
    columns = []
    for col, dtype in zip(df.columns, df.dtypes):
        col_name = col.replace('"', '""')  # Escape double quotes in column names
        if 'int' in str(dtype):
            columns.append(f'"{col_name}" INTEGER')
        elif 'float' in str(dtype):
            columns.append(f'"{col_name}" FLOAT')
        else:
            columns.append(f'"{col_name}" TEXT')

    create_table_query = f'CREATE TABLE "{table_name}" ({", ".join(columns)});'
    print(f"Creating table {table_name}...")

    # Execute the CREATE TABLE query
    cur.execute(create_table_query)
    conn.commit()

    # Insert DataFrame records into the table, one row at a time
    for index, row in df.iterrows():
        placeholders = ', '.join(['%s'] * len(row))
        insert_query = f'INSERT INTO "{table_name}" VALUES ({placeholders});'
        cur.execute(insert_query, tuple(row))

    conn.commit()
    print(f"Data imported into {table_name} successfully!")

    # Move the processed file to the 'csv_files_old' directory
    shutil.move(csv_file, os.path.join(csv_dir_old, os.path.basename(csv_file)))

# Close the cursor and connection
cur.close()
conn.close()
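
The row-by-row INSERT loop above costs one round trip per row, which gets slow on large CSVs. Not part of this commit, but a common psycopg2 speed-up is batching with execute_values; a minimal sketch of the replacement for the iterrows loop, assuming the same df, cur, and table_name:

from psycopg2.extras import execute_values

# Plain tuples in column order; None values become NULLs
rows = [tuple(r) for r in df.itertuples(index=False, name=None)]
execute_values(cur, f'INSERT INTO "{table_name}" VALUES %s', rows)
conn.commit()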

docker-compose.yml (Normal file, 55 lines)
@@ -0,0 +1,55 @@
services:
  postgres:
    image: postgres:15
    container_name: postgres
    environment:
      POSTGRES_USER: ${POSTGRES_USER}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      POSTGRES_DB: ${POSTGRES_DB}
    volumes:
      - postgres_data:/var/lib/postgresql/data  # Docker-managed volume
      - /home/ams/postgres/csv_files:/data
    ports:
      - "5432:5432"
    networks:
      - postgres-network
    restart: always

  pgadmin:
    image: dpage/pgadmin4
    container_name: pgadmin
    environment:
      PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL}
      PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD}
      PGADMIN_CONFIG_WTF_CSRF_CHECK_DEFAULT: 'False'
      PGADMIN_CONFIG_WTF_CSRF_ENABLED: 'False'
    volumes:
      - pgadmin_data:/var/lib/pgadmin  # Docker-managed volume
      - /home/ams/postgres/csv_files:/pgadmin/storage
    ports:
      - "5050:80"
    networks:
      - postgres-network
    restart: always

  tunnel:
    container_name: cloudflared-postgres
    image: cloudflare/cloudflared
    restart: unless-stopped
    command: tunnel run
    environment:
      - TUNNEL_TOKEN=${TUNNEL_TOKEN}
    networks:
      - postgres-network

volumes:
  postgres_data:  # Docker-managed volume for PostgreSQL data
  pgadmin_data:   # Docker-managed volume for pgAdmin data

networks:
  postgres-network:
    driver: bridge
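
All four files read their secrets from a .env file next to docker-compose.yml, which is (rightly) not committed. A hypothetical template listing the variables referenced above; every value is a placeholder:

# .env template; values are placeholders, not from this repo
POSTGRES_USER=ams
POSTGRES_PASSWORD=change-me
POSTGRES_DB=ams
PGADMIN_DEFAULT_EMAIL=admin@example.com
PGADMIN_DEFAULT_PASSWORD=change-me
TUNNEL_TOKEN=your-cloudflare-tunnel-token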