import os
import time
import re
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Matches an English ordinal suffix directly after a day number ("14th" -> "14").
_ORDINAL_SUFFIX_RE = re.compile(r'(\d+)(st|nd|rd|th)')

# strptime layouts tried in order once ordinal suffixes and commas are removed.
_DATE_FORMATS = ('%B %d %Y', '%b %d %Y')

# The same anchor styling is used for tournament links on the listing page and
# for deck links on a tournament page.
_TITLE_LINK_SELECTOR = 'a.line-clamp-2.text-xl.font-bold.underline'


def setup_browser(download_dir, geckodriver_path='/usr/local/bin/geckodriver'):
    """Start a Firefox WebDriver that saves downloads into *download_dir*.

    Args:
        download_dir: Directory Firefox should save downloads to. It is made
            absolute before being handed to Firefox.
        geckodriver_path: Location of the geckodriver executable. Defaults to
            the path this script has always used.

    Returns:
        The started ``webdriver.Firefox`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    firefox_options = Options()
    preferences = (
        # 2 = save into the custom directory named by browser.download.dir.
        ('browser.download.folderList', 2),
        ('browser.download.dir', os.path.abspath(download_dir)),
        # MIME types that are saved without showing a prompt.
        ('browser.helperApps.neverAsk.saveToDisk',
         'text/plain,text/csv,application/octet-stream'),
        ('pdfjs.disabled', True),
        ('browser.download.manager.showWhenStarting', False),
        ('browser.download.manager.useWindow', False),
        ('browser.download.manager.focusWhenStarting', False),
        ('browser.download.manager.alertOnEXEOpen', False),
        ('browser.download.manager.showAlertOnComplete', False),
        ('browser.download.manager.closeWhenDone', False),
    )
    for pref_name, pref_value in preferences:
        firefox_options.set_preference(pref_name, pref_value)

    gecko_service = Service(executable_path=geckodriver_path)
    return webdriver.Firefox(service=gecko_service, options=firefox_options)


def parse_date(date_str):
    """Parse a human-readable date such as ``"November 14th 2024"``.

    Ordinal suffixes (st/nd/rd/th) after the day number are removed, commas
    and repeated or surrounding whitespace are ignored, and both full and
    abbreviated English month names are accepted.

    Args:
        date_str: The text to parse. May be empty or ``None``.

    Returns:
        A ``datetime.date``, or ``None`` when the text cannot be parsed. The
        outcome is also printed.
    """
    if not date_str:
        print(f"Error parsing date '{date_str}': empty date string")
        return None

    cleaned = _ORDINAL_SUFFIX_RE.sub(r'\1', date_str)
    # Turn commas into spaces and collapse whitespace runs so that
    # "Nov 14, 2024" and " November 14 2024 " fit the formats below.
    cleaned = ' '.join(cleaned.replace(',', ' ').split())

    last_error = None
    for date_format in _DATE_FORMATS:
        try:
            tournament_date = datetime.strptime(cleaned, date_format).date()
        except ValueError as e:
            last_error = e
            continue
        print(f"Parsed date: {tournament_date}")
        return tournament_date

    print(f"Error parsing date '{cleaned}': {last_error}")
    return None


def get_tournaments_after_date(browser, date_threshold):
    """Collect tournaments listed on edhtop16.com dated on or after a threshold.

    Args:
        browser: A started WebDriver; it is navigated to the listing page.
        date_threshold: ``datetime.date``; tournaments whose parsed date is
            greater than or equal to it are kept.

    Returns:
        A list of ``(tournament_url, tournament_name, tournament_date)``
        tuples in page order. Entries that cannot be read, have no link
        target, or have an unparseable date are skipped.
    """
    tournaments = []
    print("Navigating to the tournaments page...")
    tournaments_page_url = 'https://edhtop16.com/tournaments?sortBy=DATE'
    browser.get(tournaments_page_url)
    time.sleep(3)  # Give the page time to render its entries

    # Find all tournament entries
    tournament_entries = browser.find_elements(
        By.CSS_SELECTOR,
        "div.group.relative.overflow-hidden.rounded-lg.bg-white.shadow",
    )
    print(f"Found {len(tournament_entries)} tournaments on the page.")

    for entry in tournament_entries:
        # Best effort: one malformed card must not stop the whole listing.
        try:
            # Extract tournament link, name, and date
            link_element = entry.find_element(By.CSS_SELECTOR, _TITLE_LINK_SELECTOR)
            tournament_name = link_element.text
            tournament_url = link_element.get_attribute('href')
            date_element = entry.find_element(By.CSS_SELECTOR, 'span')
            tournament_date_str = date_element.text
            print(f"Tournament found: {tournament_name}, Date: {tournament_date_str}, URL: {tournament_url}")

            if not tournament_url:
                # Without a target there is nothing to navigate to later.
                print(f"Skipping tournament without a link: {tournament_name}")
                continue

            # Parse the date string
            tournament_date = parse_date(tournament_date_str)
            if tournament_date and tournament_date >= date_threshold:
                tournaments.append((tournament_url, tournament_name, tournament_date))
        except Exception as e:
            print(f"Error processing tournament entry: {e}")

    print(f"Total tournaments after {date_threshold}: {len(tournaments)}")
    return tournaments


def get_tournament_info(browser):
    """Read the tournament name and deck links from the current page.

    Args:
        browser: A WebDriver already navigated to a tournament page.

    Returns:
        ``(tournament_name, deck_links)`` where ``tournament_name`` is the
        text of the first ``<h1>`` with ``/`` replaced by ``-`` (or
        ``"Tournament"`` when it is missing or empty), and ``deck_links`` is a
        list of ``(deck_url, deck_name)`` tuples in page order. Anchors
        without an ``href`` are left out.
    """
    print("Retrieving the tournament name and deck links...")
    time.sleep(3)  # Wait for the page to load

    # Get the tournament name
    try:
        # find_element raises when nothing matches, so only empty text needs
        # the explicit fallback.
        tournament_name = browser.find_element(By.TAG_NAME, 'h1').text or "Tournament"
    except Exception as e:
        print(f"Error retrieving tournament name: {e}")
        tournament_name = "Tournament"
    tournament_name = tournament_name.replace('/', '-')  # Replace invalid filename characters
    print(f"Tournament Name: {tournament_name}")

    # Get the list of decks
    deck_links = []
    for deck_element in browser.find_elements(By.CSS_SELECTOR, _TITLE_LINK_SELECTOR):
        deck_url = deck_element.get_attribute('href')
        if not deck_url:
            # A link without a target cannot be opened by the caller.
            continue
        deck_links.append((deck_url, deck_element.text))

    print(f"Total decks found: {len(deck_links)}")
    return tournament_name, deck_links
# Locators for the deck page's export flow, in the order they are clicked.
_MORE_BUTTON_XPATH = "//span[contains(text(), 'More')]/.."
_EXPORT_OPTION_XPATH = "//a[contains(@class, 'dropdown-item') and contains(text(), 'Export')]"
_MTGO_LINK_XPATH = "//a[contains(@class, 'btn btn-primary') and contains(text(), 'Download for MTGO')]"


def _sanitize_filename(name, fallback="Tournament"):
    """Return *name* as a single safe path component, or *fallback* if unusable."""
    cleaned = (name or '').replace('/', '-').replace('\\', '-').strip()
    # '', '.' and '..' would resolve to the parent or the directory itself.
    if cleaned in ('', '.', '..'):
        return fallback
    return cleaned


def _ensure_dir(path, label):
    """Create *path* if it does not exist yet and report the creation."""
    if not os.path.isdir(path):
        os.makedirs(path, exist_ok=True)
        print(f"Created {label} directory at {path}")


def _find_completed_download(save_dir, preexisting, suffix='.txt'):
    """Return the newest finished *suffix* file not in *preexisting*, or None."""
    new_names = set(os.listdir(save_dir)) - set(preexisting)
    # NOTE(review): assumes Firefox keeps a "*.part" file (and possibly an
    # empty placeholder) next to the target while a download is still being
    # written; confirm against the Firefox version in use.
    if any(name.endswith('.part') for name in new_names):
        return None
    candidates = [
        path
        for path in (os.path.join(save_dir, name) for name in new_names if name.endswith(suffix))
        if os.path.getsize(path) > 0
    ]
    if not candidates:
        return None
    return max(candidates, key=os.path.getctime)


def _wait_for_new_download(save_dir, preexisting, timeout=30, poll_interval=1.0, suffix='.txt'):
    """Poll *save_dir* until a new finished download appears or *timeout* seconds pass."""
    deadline = time.monotonic() + timeout
    while True:
        found = _find_completed_download(save_dir, preexisting, suffix)
        if found is not None:
            return found
        if time.monotonic() >= deadline:
            return None
        time.sleep(poll_interval)


def download_deck(browser, deck_url, rank, total_decks, save_dir):
    """Download one deck list via the page's "Download for MTGO" export.

    Opens *deck_url*, clicks More -> Export -> Download for MTGO, waits up to
    30 seconds for a new ``.txt`` file to appear in *save_dir*, and renames it
    to ``"{rank}-{total_decks}-{original name}"``.

    Args:
        browser: A started WebDriver whose download directory is *save_dir*.
        deck_url: URL of the deck page.
        rank: 1-based position of the deck, used in messages and the filename.
        total_decks: Number of decks in the tournament, used the same way.
        save_dir: Directory the browser downloads into.

    Returns:
        None. Every failure, including navigation errors, is printed and
        swallowed so the caller can continue with the next deck.
    """
    print(f"Navigating to deck {rank} page...")
    try:
        # Snapshot the directory first: only files that appear afterwards can
        # belong to this deck. Without it, a slow or failed download made the
        # previous deck's already-renamed file look like the new one.
        preexisting = set(os.listdir(save_dir))

        browser.get(deck_url)

        # Wait for the "More" button and click it
        print("Waiting for the 'More' button to appear...")
        more_button = WebDriverWait(browser, 10).until(
            EC.visibility_of_element_located((By.XPATH, _MORE_BUTTON_XPATH)))
        browser.execute_script("arguments[0].click();", more_button)
        time.sleep(1)

        # Wait for and click the "Export" option within the dropdown
        print("Waiting for the 'Export' option...")
        export_option = WebDriverWait(browser, 10).until(
            EC.visibility_of_element_located((By.XPATH, _EXPORT_OPTION_XPATH)))
        browser.execute_script("arguments[0].click();", export_option)
        time.sleep(1)

        # Wait for and click the "Download for MTGO" link
        print("Clicking 'Download for MTGO' link...")
        mtgo_download_link = WebDriverWait(browser, 10).until(
            EC.visibility_of_element_located((By.XPATH, _MTGO_LINK_XPATH)))
        mtgo_download_link.click()

        # Wait for the downloaded file to appear in the download directory
        print("Waiting for the file to download...")
        downloaded_filepath = _wait_for_new_download(save_dir, preexisting, timeout=30)

        if downloaded_filepath is None:
            print(f"Failed to download deck {rank}: Download timed out.")
            return

        # Move and rename the file now that it is complete
        original_filename = os.path.basename(downloaded_filepath)
        sanitized_filename = _sanitize_filename(original_filename, fallback='deck.txt')
        new_filename = os.path.join(save_dir, f"{rank}-{total_decks}-{sanitized_filename}")
        # os.replace overwrites a leftover from an earlier run on every
        # platform; os.rename only does so on POSIX.
        os.replace(downloaded_filepath, new_filename)
        print(f"Downloaded deck {rank}/{total_decks}: {new_filename}")
    except Exception as e:
        print(f"Error downloading deck {rank}: {e}")


def main(date_threshold=None):
    """Download the deck lists of every tournament on or after a date.

    Args:
        date_threshold: ``datetime.date`` lower bound (inclusive). Defaults to
            November 14, 2024.

    Files are written to ``./downloads/<tournament name>/``. Firefox is
    restarted for each tournament so that its download directory can point at
    that tournament's folder.
    """
    # Set up the base download directory
    base_download_dir = os.path.join(os.getcwd(), 'downloads')
    _ensure_dir(base_download_dir, 'base download')

    # Set the date threshold (default: November 14, 2024)
    if date_threshold is None:
        date_threshold = datetime(2024, 11, 14).date()

    # Initialize the browser
    print("Setting up the browser...")
    browser = setup_browser(base_download_dir)  # Initial browser setup
    try:
        # Retrieve list of tournaments after the date threshold
        tournaments = get_tournaments_after_date(browser, date_threshold)

        for tournament_url, tournament_name, tournament_date in tournaments:
            print(f"\nProcessing tournament: {tournament_name} dated {tournament_date}")

            # Create a specific directory for the tournament. The name comes
            # from the web page, so keep it to a single safe path component.
            tournament_dir = os.path.join(base_download_dir, _sanitize_filename(tournament_name))
            _ensure_dir(tournament_dir, 'tournament')

            # Reconfigure the browser to use the tournament directory. Clear
            # the reference in between so the finally block never calls
            # quit() on an already-closed session if the relaunch fails.
            browser.quit()
            browser = None
            browser = setup_browser(tournament_dir)

            # Navigate to the tournament page
            print(f"Navigating to the tournament page {tournament_url}...")
            browser.get(tournament_url)
            time.sleep(3)  # Wait for the page to load

            # Retrieve tournament info (this will get the decks)
            _, deck_links = get_tournament_info(browser)

            # Download each deck
            total_decks = len(deck_links)
            for idx, (deck_url, deck_name) in enumerate(deck_links, start=1):
                print(f"\nProcessing deck {idx}/{total_decks}: {deck_name}")
                download_deck(browser, deck_url, idx, total_decks, tournament_dir)
    finally:
        print("Closing the browser...")
        if browser is not None:
            browser.quit()


if __name__ == '__main__':
    main()