# USCLAP/download_tournament.py

import os
import time
import re
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def setup_browser(download_dir):
    """Build a Firefox WebDriver whose downloads are directed to *download_dir*.

    Args:
        download_dir: Directory path written to ``browser.download.dir``.

    Returns:
        The ``webdriver.Firefox`` instance created with these preferences
        and the geckodriver service at ``/usr/local/bin/geckodriver``.
    """
    # Preferences are applied in this order. The intent appears to be silent,
    # prompt-free saving of text/CSV/binary downloads (confirm the exact
    # semantics against the Firefox preference documentation).
    download_prefs = {
        'browser.download.folderList': 2,
        'browser.download.dir': download_dir,
        'browser.helperApps.neverAsk.saveToDisk': 'text/plain,text/csv,application/octet-stream',
        'pdfjs.disabled': True,
        'browser.download.manager.showWhenStarting': False,
        'browser.download.manager.useWindow': False,
        'browser.download.manager.focusWhenStarting': False,
        'browser.download.manager.alertOnEXEOpen': False,
        'browser.download.manager.showAlertOnComplete': False,
        'browser.download.manager.closeWhenDone': False,
    }

    options = Options()
    for pref_name, pref_value in download_prefs.items():
        options.set_preference(pref_name, pref_value)

    # geckodriver location is fixed; the binary must exist at this path.
    driver_service = Service(executable_path='/usr/local/bin/geckodriver')
    return webdriver.Firefox(service=driver_service, options=options)

def parse_date(date_str):
    """Parse a human-readable date such as ``"November 14th 2024"``.

    Ordinal suffixes (``st``/``nd``/``rd``/``th``) directly after a number are
    removed, commas are treated as separators, and surrounding or repeated
    whitespace is ignored before the text is matched against the
    ``'%B %d %Y'`` format (full month name, day, four-digit year).

    Args:
        date_str: The text to parse.

    Returns:
        A ``datetime.date`` on success, or ``None`` when *date_str* is not a
        string or does not match the expected format. Failures are reported
        with a printed message rather than an exception.
    """
    if not isinstance(date_str, str):
        # re.sub would raise TypeError here, which callers do not expect.
        print(f"Error parsing date '{date_str}': expected a string")
        return None

    # Strip ordinal suffixes; the word boundary keeps the match tied to the
    # end of the day token.
    cleaned = re.sub(r'(\d+)(st|nd|rd|th)\b', r'\1', date_str, flags=re.IGNORECASE)
    # Commas become separators and whitespace runs collapse, so
    # " March 1st, 2023 " normalizes to "March 1 2023".
    cleaned = ' '.join(cleaned.replace(',', ' ').split())

    try:
        tournament_date = datetime.strptime(cleaned, '%B %d %Y').date()
    except ValueError as e:
        print(f"Error parsing date '{date_str}': {e}")
        return None

    print(f"Parsed date: {tournament_date}")
    return tournament_date

def get_tournaments_after_date(browser, date_threshold):
    """Collect tournaments listed on edhtop16 dated on or after a threshold.

    Loads the date-sorted tournaments page, reads each tournament card's
    link, name and date text, and keeps those whose parsed date is
    ``>= date_threshold`` (the comparison is inclusive despite the name).

    Args:
        browser: Selenium WebDriver used to load and query the page.
        date_threshold: Value compared against the parsed tournament date.

    Returns:
        A list of ``(url, name, date)`` tuples in page order. Cards that
        raise while being processed are reported and skipped.
    """
    print("Navigating to the tournaments page...")
    browser.get('https://edhtop16.com/tournaments?sortBy=DATE')

    # Fixed pause before querying the DOM (presumably to let the page render).
    time.sleep(3)

    cards = browser.find_elements(
        By.CSS_SELECTOR,
        "div.group.relative.overflow-hidden.rounded-lg.bg-white.shadow",
    )
    print(f"Found {len(cards)} tournaments on the page.")

    selected = []
    for card in cards:
        try:
            anchor = card.find_element(By.CSS_SELECTOR, 'a.line-clamp-2.text-xl.font-bold.underline')
            name = anchor.text
            url = anchor.get_attribute('href')

            # The first <span> inside the card is read as the date text.
            raw_date = card.find_element(By.CSS_SELECTOR, 'span').text
            print(f"Tournament found: {name}, Date: {raw_date}, URL: {url}")

            parsed = parse_date(raw_date)
            if not parsed:
                continue
            if parsed >= date_threshold:
                selected.append((url, name, parsed))
        except Exception as e:
            print(f"Error processing tournament entry: {e}")

    print(f"Total tournaments after {date_threshold}: {len(selected)}")
    return selected

def get_tournament_info(browser):
    """Read the tournament title and deck links from the currently loaded page.

    Args:
        browser: Selenium WebDriver already pointed at a tournament page.

    Returns:
        A ``(tournament_name, deck_links)`` tuple. ``tournament_name`` is the
        text of the first ``<h1>`` with ``/`` replaced by ``-`` (or
        ``"Tournament"`` if the lookup fails); ``deck_links`` is a list of
        ``(href, link_text)`` tuples for every matching anchor on the page.
    """
    print("Retrieving the tournament name and deck links...")
    time.sleep(3)  # Fixed pause before querying the page

    fallback_name = "Tournament"
    try:
        heading = browser.find_element(By.TAG_NAME, 'h1')
        if heading:
            tournament_name = heading.text
        else:
            tournament_name = fallback_name
    except Exception as e:
        print(f"Error retrieving tournament name: {e}")
        tournament_name = fallback_name

    # '/' would be read as a path separator if the name is used in a path.
    tournament_name = tournament_name.replace('/', '-')
    print(f"Tournament Name: {tournament_name}")

    anchors = browser.find_elements(By.CSS_SELECTOR, "a.line-clamp-2.text-xl.font-bold.underline")
    deck_links = [(anchor.get_attribute('href'), anchor.text) for anchor in anchors]

    print(f"Total decks found: {len(deck_links)}")
    return tournament_name, deck_links

def _wait_until_visible(browser, xpath, timeout=10):
    """Wait up to *timeout* seconds for *xpath* to be visible and return the element."""
    return WebDriverWait(browser, timeout).until(
        EC.visibility_of_element_located((By.XPATH, xpath))
    )


def _wait_for_new_download(save_dir, known_files, timeout=30, poll_interval=1):
    """Poll *save_dir* for a finished ``.txt`` file that is not in *known_files*.

    A file only counts once no new ``.part`` file is present, since Firefox
    keeps in-progress download data in ``*.part`` files. Returns the path of
    the newest qualifying file, or ``None`` if *timeout* seconds elapse.
    """
    waited = 0
    while waited < timeout:
        new_names = set(os.listdir(save_dir)) - known_files
        still_downloading = any(name.endswith('.part') for name in new_names)
        finished = [name for name in new_names if name.endswith('.txt')]
        if finished and not still_downloading:
            return max(
                (os.path.join(save_dir, name) for name in finished),
                key=os.path.getctime,
            )
        time.sleep(poll_interval)
        waited += poll_interval
    return None


def download_deck(browser, deck_url, rank, total_decks, save_dir):
    """Export one deck as an MTGO text file and prefix it with its rank.

    Opens *deck_url*, clicks "More" -> "Export" -> "Download for MTGO", waits
    for the new ``.txt`` file to appear in *save_dir*, and renames it to
    ``"{rank}-{total_decks}-{original name}"``.

    Args:
        browser: Selenium WebDriver whose download directory is *save_dir*.
        deck_url: URL of the deck page.
        rank: 1-based position of the deck, used in the filename prefix.
        total_decks: Number of decks in the tournament, used in the prefix.
        save_dir: Directory the browser downloads into.

    Returns:
        None. Every failure, including a failed navigation, is reported with
        a printed message so one bad deck cannot abort the whole run.
    """
    print(f"Navigating to deck {rank} page...")

    try:
        # Snapshot the directory first so that only a file created by THIS
        # download is picked up. Taking the newest .txt in the directory would
        # otherwise re-rename the previous deck whenever this download is slow
        # or fails.
        known_files = set(os.listdir(save_dir))

        browser.get(deck_url)

        # Wait for the "More" button and click it
        print("Waiting for the 'More' button to appear...")
        more_button = _wait_until_visible(browser, "//span[contains(text(), 'More')]/..")
        browser.execute_script("arguments[0].click();", more_button)
        time.sleep(1)

        # Wait for and click the "Export" option within the dropdown
        print("Waiting for the 'Export' option...")
        export_option = _wait_until_visible(
            browser,
            "//a[contains(@class, 'dropdown-item') and contains(text(), 'Export')]",
        )
        browser.execute_script("arguments[0].click();", export_option)
        time.sleep(1)

        # Wait for and click the "Download for MTGO" link
        print("Clicking 'Download for MTGO' link...")
        mtgo_download_link = _wait_until_visible(
            browser,
            "//a[contains(@class, 'btn btn-primary') and contains(text(), 'Download for MTGO')]",
        )
        mtgo_download_link.click()
        time.sleep(2)  # Give the download a moment to start before polling

        print("Waiting for the file to download...")
        downloaded_filepath = _wait_for_new_download(save_dir, known_files)

        if downloaded_filepath is None:
            print(f"Failed to download deck {rank}: Download timed out.")
            return

        original_filename = os.path.basename(downloaded_filepath)
        # Replace characters that could be read as path separators.
        sanitized_filename = original_filename.replace('/', '-').replace('\\', '-')
        new_filename = os.path.join(save_dir, f"{rank}-{total_decks}-{sanitized_filename}")
        os.rename(downloaded_filepath, new_filename)
        print(f"Downloaded deck {rank}/{total_decks}: {new_filename}")

    except Exception as e:
        print(f"Error downloading deck {rank}: {e}")

def main():
    """Download every deck of each qualifying tournament into ./downloads.

    Creates ``downloads/`` under the current working directory, lists the
    tournaments that ``get_tournaments_after_date`` returns for the cutoff,
    and for each one creates a sub-directory, restarts the browser so its
    download directory points there, and downloads each deck in turn. The
    browser is always quit on exit.
    """
    downloads_root = os.path.join(os.getcwd(), 'downloads')
    if not os.path.exists(downloads_root):
        os.makedirs(downloads_root)
        print(f"Created base download directory at {downloads_root}")

    # Cutoff passed to get_tournaments_after_date: 14 November 2024.
    cutoff = datetime(2024, 11, 14).date()

    print("Setting up the browser...")
    browser = setup_browser(downloads_root)
    try:
        selected = get_tournaments_after_date(browser, cutoff)

        for url, name, held_on in selected:
            print(f"\nProcessing tournament: {name} dated {held_on}")

            # One folder per tournament; '/' in the name would act as a path separator.
            target_dir = os.path.join(downloads_root, name.replace('/', '-'))
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
                print(f"Created tournament directory at {target_dir}")

            # The download directory is set when the browser is created, so
            # the browser is restarted to point it at this tournament's folder.
            browser.quit()
            browser = setup_browser(target_dir)

            print(f"Navigating to the tournament page {url}...")
            browser.get(url)
            time.sleep(3)  # Fixed pause before querying the page

            # The page-derived title is returned too, but only the links are used.
            _page_title, deck_links = get_tournament_info(browser)

            deck_count = len(deck_links)
            for position, (deck_url, deck_name) in enumerate(deck_links, start=1):
                print(f"\nProcessing deck {position}/{deck_count}: {deck_name}")
                download_deck(browser, deck_url, position, deck_count, target_dir)

    finally:
        print("Closing the browser...")
        browser.quit()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()