# USCLAP/download_tournament.py
#
# 207 lines
# 9.4 KiB
# Python

import os
import time
import re
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def setup_browser(download_dir):
    """Start a Firefox WebDriver whose downloads are saved into *download_dir*.

    Args:
        download_dir: Directory handed to Firefox as ``browser.download.dir``.

    Returns:
        The ``webdriver.Firefox`` instance that was started.
    """
    # Preferences are applied in the order listed here.
    # NOTE(review): folderList=2 presumably tells Firefox to honour the custom
    # ``browser.download.dir`` below -- confirm against the Firefox docs.
    preferences = {
        'browser.download.folderList': 2,
        'browser.download.dir': download_dir,
        # MIME types that are written to disk without a prompt.
        'browser.helperApps.neverAsk.saveToDisk': 'text/plain,text/csv,application/octet-stream',
        'pdfjs.disabled': True,
        'browser.download.manager.showWhenStarting': False,
        'browser.download.manager.useWindow': False,
        'browser.download.manager.focusWhenStarting': False,
        'browser.download.manager.alertOnEXEOpen': False,
        'browser.download.manager.showAlertOnComplete': False,
        'browser.download.manager.closeWhenDone': False,
    }

    options = Options()
    for pref_name, pref_value in preferences.items():
        options.set_preference(pref_name, pref_value)

    # geckodriver is expected at this fixed location.
    driver_service = Service(executable_path='/usr/local/bin/geckodriver')
    return webdriver.Firefox(service=driver_service, options=options)
def parse_date(date_str):
    """Parse a human-readable date such as ``"November 14th 2024"``.

    Ordinal suffixes (``st``/``nd``/``rd``/``th``) after the day number are
    removed, commas are ignored and runs of whitespace are collapsed, so
    ``"November 14th, 2024"`` and ``"  Nov 14 2024 "`` are accepted as well.

    Args:
        date_str: Text of the form ``<month name> <day> <year>``; the month
            may be spelled out or abbreviated.

    Returns:
        A ``datetime.date`` on success, or ``None`` (after printing a
        diagnostic) when the text cannot be parsed.
    """
    if not isinstance(date_str, str):
        # Keep the "None on failure" contract instead of letting re.sub raise.
        print(f"Error parsing date '{date_str}': expected a string")
        return None

    # "14th" -> "14"; the digit requirement keeps "August" intact.
    cleaned = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
    # Treat commas as separators and normalise spacing so strptime's
    # single-space format matches.
    cleaned = ' '.join(cleaned.replace(',', ' ').split())

    first_error = None
    for date_format in ('%B %d %Y', '%b %d %Y'):
        try:
            tournament_date = datetime.strptime(cleaned, date_format).date()
        except ValueError as e:
            # Report the failure of the primary (full month name) format.
            if first_error is None:
                first_error = e
            continue
        print(f"Parsed date: {tournament_date}")
        return tournament_date

    print(f"Error parsing date '{cleaned}': {first_error}")
    return None
def get_tournaments_after_date(browser, date_threshold):
    """Collect tournaments from the listing page dated on or after a threshold.

    Args:
        browser: WebDriver used to load and query the listing page.
        date_threshold: Value compared against each parsed tournament date
            with ``>=``.

    Returns:
        A list of ``(url, name, date)`` tuples, in page order, for every entry
        whose date parsed successfully and is not earlier than
        *date_threshold*. Entries that raise while being processed are
        reported and skipped.
    """
    listing_url = 'https://edhtop16.com/tournaments?sortBy=DATE'
    card_selector = "div.group.relative.overflow-hidden.rounded-lg.bg-white.shadow"
    title_selector = 'a.line-clamp-2.text-xl.font-bold.underline'

    print("Navigating to the tournaments page...")
    browser.get(listing_url)
    time.sleep(3)  # fixed pause before querying the page

    cards = browser.find_elements(By.CSS_SELECTOR, card_selector)
    print(f"Found {len(cards)} tournaments on the page.")

    selected = []
    for card in cards:
        try:
            title_link = card.find_element(By.CSS_SELECTOR, title_selector)
            name = title_link.text
            url = title_link.get_attribute('href')
            # The first <span> inside the card is read as the date text.
            raw_date = card.find_element(By.CSS_SELECTOR, 'span').text
            print(f"Tournament found: {name}, Date: {raw_date}, URL: {url}")

            parsed = parse_date(raw_date)
            if parsed and parsed >= date_threshold:
                selected.append((url, name, parsed))
        except Exception as e:
            # Best effort: one broken card must not stop the scan.
            print(f"Error processing tournament entry: {e}")

    print(f"Total tournaments after {date_threshold}: {len(selected)}")
    return selected
def get_tournament_info(browser):
    """Read the tournament title and deck links from the page currently loaded.

    Args:
        browser: WebDriver already pointed at a tournament page.

    Returns:
        ``(tournament_name, deck_links)`` where *tournament_name* is the text
        of the first ``<h1>`` with ``/`` replaced by ``-`` (``"Tournament"``
        if the heading cannot be read), and *deck_links* is a list of
        ``(href, link_text)`` tuples for every matching deck anchor.
    """
    fallback_title = "Tournament"

    print("Retrieving the tournament name and deck links...")
    time.sleep(3)  # Wait for the page to load

    try:
        heading = browser.find_element(By.TAG_NAME, 'h1')
        if heading:
            title = heading.text
        else:
            title = fallback_title
    except Exception as e:
        print(f"Error retrieving tournament name: {e}")
        title = fallback_title

    # '/' would be interpreted as a path separator in a file name.
    title = title.replace('/', '-')
    print(f"Tournament Name: {title}")

    anchors = browser.find_elements(By.CSS_SELECTOR, "a.line-clamp-2.text-xl.font-bold.underline")
    decks = [(anchor.get_attribute('href'), anchor.text) for anchor in anchors]
    print(f"Total decks found: {len(decks)}")

    return title, decks
def _wait_for_new_download(save_dir, existing_files, timeout=30, poll_interval=1):
    """Wait for a finished ``.txt`` download that was not in *existing_files*.

    Args:
        save_dir: Directory the browser downloads into.
        existing_files: Names present in *save_dir* before the download was
            triggered; these are never reported as the new download.
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between directory scans.

    Returns:
        Full path of the newest new ``.txt`` file, or ``None`` on timeout.
    """
    waited = 0
    while waited < timeout:
        new_names = [name for name in os.listdir(save_dir) if name not in existing_files]
        # NOTE(review): assumes Firefox keeps a '.part' file next to a
        # download that is still being written -- confirm for the Firefox
        # version in use. Only '.part' files created after the click count,
        # so a stale leftover cannot block detection.
        still_writing = any(name.endswith('.part') for name in new_names)
        finished = [
            os.path.join(save_dir, name)
            for name in new_names
            if name.endswith('.txt')
        ]
        if finished and not still_writing:
            return max(finished, key=os.path.getctime)
        time.sleep(poll_interval)
        waited += poll_interval
    return None


def download_deck(browser, deck_url, rank, total_decks, save_dir):
    """Open a deck page, export it for MTGO and file the download under its rank.

    The page is driven through "More" -> "Export" -> "Download for MTGO".
    The resulting ``.txt`` file is renamed to
    ``<rank>-<total_decks>-<original name>`` inside *save_dir*.

    Only files that appear after the download link is clicked are
    considered, so a deck saved by an earlier call is never mistaken for the
    new download and renamed a second time.

    Args:
        browser: WebDriver whose download directory is *save_dir*.
        deck_url: URL of the deck page.
        rank: 1-based position of the deck, used in messages and the file name.
        total_decks: Number of decks in the tournament, used likewise.
        save_dir: Directory where the browser stores downloads.

    Returns:
        None. Failures after the page load are printed, not raised; an error
        from loading *deck_url* itself propagates to the caller.
    """
    more_xpath = "//span[contains(text(), 'More')]/.."
    export_xpath = "//a[contains(@class, 'dropdown-item') and contains(text(), 'Export')]"
    mtgo_xpath = "//a[contains(@class, 'btn btn-primary') and contains(text(), 'Download for MTGO')]"

    print(f"Navigating to deck {rank} page...")
    browser.get(deck_url)
    try:
        wait = WebDriverWait(browser, 10)

        # until() returns the located element, so no second lookup is needed.
        print("Waiting for the 'More' button to appear...")
        more_button = wait.until(EC.visibility_of_element_located((By.XPATH, more_xpath)))
        # Clicked through JavaScript, as the page may overlay the element.
        browser.execute_script("arguments[0].click();", more_button)
        time.sleep(1)

        print("Waiting for the 'Export' option...")
        export_option = wait.until(EC.visibility_of_element_located((By.XPATH, export_xpath)))
        browser.execute_script("arguments[0].click();", export_option)
        time.sleep(1)

        print("Clicking 'Download for MTGO' link...")
        mtgo_download_link = wait.until(EC.visibility_of_element_located((By.XPATH, mtgo_xpath)))
        # Snapshot the directory first so the new file can be told apart from
        # decks downloaded earlier.
        existing_files = set(os.listdir(save_dir))
        mtgo_download_link.click()
        time.sleep(2)  # Wait for the download to initiate

        print("Waiting for the file to download...")
        downloaded_filepath = _wait_for_new_download(save_dir, existing_files)
        if downloaded_filepath is None:
            print(f"Failed to download deck {rank}: Download timed out.")
            return

        original_filename = os.path.basename(downloaded_filepath)
        sanitized_filename = original_filename.replace('/', '-').replace('\\', '-')
        new_filename = os.path.join(save_dir, f"{rank}-{total_decks}-{sanitized_filename}")
        # os.replace overwrites an existing target on every platform, so a
        # re-run refreshes the file instead of failing on Windows.
        os.replace(downloaded_filepath, new_filename)
        print(f"Downloaded deck {rank}/{total_decks}: {new_filename}")
    except Exception as e:
        # Best effort: one failing deck must not abort the whole tournament.
        print(f"Error downloading deck {rank}: {e}")
def main():
    """Download every deck of every tournament on or after the cutoff date.

    Downloads go to ``<cwd>/downloads/<tournament name>/``. The browser is
    restarted for each tournament because its download directory is passed
    to ``setup_browser`` when the browser is created.
    """
    downloads_root = os.path.join(os.getcwd(), 'downloads')
    if not os.path.exists(downloads_root):
        os.makedirs(downloads_root)
        print(f"Created base download directory at {downloads_root}")

    # Tournaments dated earlier than this are ignored.
    cutoff = datetime(2024, 11, 14).date()

    print("Setting up the browser...")
    driver = setup_browser(downloads_root)
    try:
        for event_url, event_name, event_date in get_tournaments_after_date(driver, cutoff):
            print(f"\nProcessing tournament: {event_name} dated {event_date}")

            # One sub-directory per tournament; '/' cannot appear in a name.
            event_dir = os.path.join(downloads_root, event_name.replace('/', '-'))
            if not os.path.exists(event_dir):
                os.makedirs(event_dir)
                print(f"Created tournament directory at {event_dir}")

            # Swap the browser for one that downloads into this directory.
            driver.quit()
            driver = setup_browser(event_dir)

            print(f"Navigating to the tournament page {event_url}...")
            driver.get(event_url)
            time.sleep(3)  # Wait for the page to load

            # The title read from the page is not used; only the deck list is.
            _page_title, decks = get_tournament_info(driver)

            deck_count = len(decks)
            for position, (deck_url, deck_name) in enumerate(decks, start=1):
                print(f"\nProcessing deck {position}/{deck_count}: {deck_name}")
                download_deck(driver, deck_url, position, deck_count, event_dir)
    finally:
        print("Closing the browser...")
        driver.quit()


# Run the scraper only when the file is executed as a script.
if __name__ == '__main__':
    main()