import os
import sys
import time
from typing import List, Optional, Set, Tuple
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.webdriver import WebDriver
from webdriver_manager.chrome import ChromeDriverManager

# Make the parent directory importable so that my_slack can be found
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from my_slack.my_slack import send_slack

# Set up the Chrome WebDriver
def setup_chrome_driver() -> WebDriver:
    """Configure the Chrome WebDriver and return an initialized driver instance."""
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--headless')  # for running on a server
    options.add_argument('--disable-gpu')  # not needed in headless mode, kept as a precaution
    options.add_argument('--disable-blink-features=AutomationControlled')  # mitigate bot detection
    options.add_argument('--lang=ja')
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')

    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")  # mitigate bot detection
        return driver
    except Exception as e:
        print(f"Error setting up Chrome driver: {str(e)}")
        send_slack(f"ChromeDriverのセットアップに失敗しました: {str(e)}")
        raise

# Get the directory depth of a URL
def get_url_depth(url: str) -> int:
    """Calculate the directory depth of a URL (0 for the site root)."""
    path = urlparse(url).path.strip('/')
    return len(path.split('/')) if path else 0
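
# Illustrative values for get_url_depth (example.com is a placeholder):
#   get_url_depth("https://example.com/about")          -> 1
#   get_url_depth("https://example.com/services/")      -> 1   (trailing slash ignored)
#   get_url_depth("https://example.com/news/2024/01")   -> 3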

# Decide whether a URL should be excluded
def should_exclude_url(url: str) -> bool:
    """
    Return True if the URL should be excluded:
    - resource files (images, icons, CSS, etc.)
    - URLs containing a language path segment (e.g. /en/)
    - URLs using the http scheme
    """
    resource_extensions = {
        '.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg',
        '.css', '.js', '.woff', '.woff2', '.ttf', '.pdf', '.php',
        '.webmanifest'
    }
    parsed = urlparse(url)
    path_parts = parsed.path.lower().split('/')
    
    # Check for resource files
    is_resource = any(parsed.path.lower().endswith(ext) for ext in resource_extensions)
    
    # Check for a language path segment (e.g. /en/)
    has_language_path = any(part == 'en' for part in path_parts)
    
    # Check for the http scheme
    is_http = parsed.scheme == 'http'
    
    return is_resource or has_language_path or is_http
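
# Illustrative values for should_exclude_url (example.com is a placeholder):
#   should_exclude_url("https://example.com/logo.png")   -> True   (resource file)
#   should_exclude_url("https://example.com/en/about")   -> True   (language path)
#   should_exclude_url("http://example.com/")            -> True   (http scheme)
#   should_exclude_url("https://example.com/services")   -> False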

# Normalize a URL
def normalize_url(url: str) -> str:
    """
    Normalize a URL by:
    - Removing fragments and query parameters
    - Handling trailing slashes consistently
    - Lowercasing the network location
    """
    parsed = urlparse(url)
    # Keep a single trailing slash for the base URL
    if not parsed.path or parsed.path == '/':
        path = '/'
    else:
        # Strip the trailing slash from other URLs
        path = parsed.path.rstrip('/')

    return f"{parsed.scheme}://{parsed.netloc.lower()}{path}"


# Extract URLs from dynamically rendered content using Selenium
def get_dynamic_urls(url: str, driver: webdriver.Chrome) -> Set[str]:
    """Extract URLs using Selenium for dynamic content."""
    dynamic_urls = set()
    try:
        driver.get(url)
        time.sleep(2)  # Wait for dynamic content
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for link in soup.find_all(['a', 'link'], href=True):
            href = link['href']
            full_url = urljoin(url, href.split('#')[0])
            parsed = urlparse(full_url)
            if parsed.netloc == urlparse(url).netloc:
                dynamic_urls.add(full_url)

    except Exception as e:
        print(f"Error in dynamic extraction: {e}")
    return dynamic_urls
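
# Usage sketch for get_dynamic_urls (example.com is a placeholder; the caller owns the driver):
#   driver = setup_chrome_driver()
#   try:
#       urls = get_dynamic_urls("https://example.com/", driver)
#   finally:
#       driver.quit()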

# Collect subdirectory URLs
def get_subdirectory_urls(url: str, max_urls: int = 100) -> List[str]:
    """
    Extract URLs using both static and dynamic methods.
    Combines results, removes duplicates, and sorts by depth.

    Args:
        url (str): The base URL to start crawling from
        max_urls (int): Maximum number of URLs to collect (default: 100)

    Returns:
        List[str]: Combined list of URLs (up to 50), sorted by depth (shallow first)
    """
    visited = set()
    result = []  # Store (depth, url) tuples

    def crawl_link(current_url: str, limit: int = max_urls):
        # If we've reached the URL limit, stop
        if len(result) >= limit:
            return
            
        # Normalize URL and check if visited
        normalized_url = normalize_url(current_url)
        if normalized_url not in visited:
            visited.add(normalized_url)
            depth = get_url_depth(normalized_url)
            result.append((depth, normalized_url))

        # Attempt to retrieve and parse subdirectory links
        try:
            resp = requests.get(current_url, timeout=10)
            if resp.status_code != 200:
                return

            soup = BeautifulSoup(resp.text, 'html.parser')
            base_domain = urlparse(url).netloc

            # For each link in the page
            for link_tag in soup.find_all('a', href=True):
                if len(result) >= limit:
                    break  # we already have enough URLs

                href = link_tag['href']
                if '#' in href or '?' in href:
                    continue
                    
                abs_url = urljoin(current_url, href)
                normalized_abs_url = normalize_url(abs_url)
                parsed = urlparse(normalized_abs_url)

                # Check if URL belongs to same domain (removed base_path restriction)
                if parsed.netloc == base_domain:
                    if normalized_abs_url not in visited:
                        crawl_link(normalized_abs_url, limit)

        except Exception:
            pass

    # Get static URLs through recursion
    crawl_link(url, max_urls)
    static_urls = {url_tuple[1] for url_tuple in result}
    print(f"Found {len(static_urls)} static URLs")
    
    # Setup Selenium for dynamic extraction
    chrome_options = Options()
    chrome_options.add_argument('--headless=new')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    
    driver = None
    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        # Get dynamic URLs
        dynamic_urls = get_dynamic_urls(url, driver)

    finally:
        if driver:
            driver.quit()
    
    # Combine static and dynamic URLs
    all_urls = static_urls.union(dynamic_urls)
    
    # Deduplicate, normalize, and filter out resource URLs
    print("\nRemoving duplicates, normalizing URLs, and filtering resource files...")
    normalized_urls = set()

    for candidate_url in all_urls:
        # Skip URLs that should be excluded
        if should_exclude_url(candidate_url):
            continue
        # Normalize the URL (including trailing-slash handling)
        norm_url = normalize_url(candidate_url)
        normalized_urls.add(norm_url)
        
    # Compute the depth of each URL and sort (shallowest first, capped at 50)
    url_depths = [(u, get_url_depth(u)) for u in normalized_urls]
    sorted_results = sorted(url_depths, key=lambda x: (x[1], x[0]))[:50]

    # Keep only the URLs
    url_list = [u for u, _ in sorted_results]
    
    print("-" * 50)
    print("\nURLs only:")
    print(url_list)
    # Return URLs in depth order

    return url_list
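
# Usage sketch for get_subdirectory_urls (example.com is a placeholder):
#   urls = get_subdirectory_urls("https://example.com/", max_urls=100)
#   # -> up to 50 normalized same-domain URLs, shallowest first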

# Fetch the contents of multiple URLs
def fetch_multiple_urls(url: str) -> Optional[List[Tuple[str, str, List[str]]]]:
    """Collect subdirectory URLs for the given base URL and scrape each page.

    Returns a list of (url, title, content) tuples, or None if nothing was scraped.
    """
    driver = None
    try:
        print(f"Starting scraping process for URL: {url}")
        driver = setup_chrome_driver()

        # Step 1: Collect the subdirectories of the provided URL
        url_list = get_subdirectory_urls(url)
        if not url_list:
            print("No URLs found to scrape")
            return None

        # Step 2: Scrape the contents of the collected URLs
        data = []
        for page_url in url_list:
            driver.get(page_url)
            time.sleep(2)  # Wait for the page to load
            page_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Extract the required information
            title = page_soup.title.string if page_soup.title else 'No title'
            extracted_content = url_scraping(driver, page_url)
            # Extract any other required information here
            data.append((page_url, title, extracted_content))
        
        return data if data else None

    except Exception as e:
        print(f"Error occurred while scraping {url}: {str(e)}")
        send_slack(f"Seleniumでエラーが発生しました。\nURL: {url}\nエラー: {str(e)}\nファイル: fetch_multiple_urls.py")
        return None
    
    finally:
        if driver:
            driver.quit()
            print("Browser session closed successfully")

# Scrape the contents of the specified URL
def url_scraping(driver: WebDriver, url: str) -> List[str]:
    """Fetch the page content of the specified URL."""
    try:
        driver.get(url)
        time.sleep(2)  # Wait for the page to load

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        content = extract_page_content(soup)

        return content

    except WebDriverException as e:
        print(f"Error while fetching URL {url}: {str(e)}")
        return []

# Extract content from a BeautifulSoup object
def extract_page_content(soup: BeautifulSoup) -> List[str]:
    """Extract formatted content from a BeautifulSoup object."""
    content = []
    elements = soup.find_all(['p', 'ul', 'ol'])
    
    for element in elements:
        if element.name == 'p':
            # Get the text of the paragraph (p tag)
            text = element.get_text(separator=' ', strip=True)
            if text:
                content.append(text)

        elif element.name in ['ul', 'ol']:
            # Handle bulleted and numbered lists (ul, ol)
            list_items = element.find_all('li')
            for item in list_items:
                text = item.get_text(separator=' ', strip=True)
                if text:
                    content.append(f"- {text}")  # prefix each list item with "-" for readability

    return content
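
# Illustrative values for extract_page_content (the HTML snippet is a placeholder):
#   soup = BeautifulSoup("<p>Hello</p><ul><li>One</li><li>Two</li></ul>", 'html.parser')
#   extract_page_content(soup)  -> ['Hello', '- One', '- Two']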

# Run only when the script is executed directly
if __name__ == "__main__":
    url = "https://newgrads.visional.inc/"
    fetch_multiple_urls(url)