"""
Script pour télécharger et extraire les statistiques d'immatriculation UNI VDL
Version améliorée : scrape la page web pour trouver les vrais liens PDF
(car le format des URLs change tous les mois)
"""

import os
import re
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import PyPDF2
from pathlib import Path
from urllib.parse import urljoin

# Configuration
# Folder where the downloaded PDFs are stored (absolute, machine-specific path).
PDF_DIR = r"C:\Users\jeanl\Documents\01. Finance in London\16. Python\uni_vdl\pdf"
# Folder that receives the Excel/CSV exports; it is the parent of PDF_DIR.
OUTPUT_DIR = r"C:\Users\jeanl\Documents\01. Finance in London\16. Python\uni_vdl"
# Web page that is scraped for the links to the statistics PDFs.
STATS_PAGE_URL = "https://www.univdl.com/statistiques-du-marche/"

# Create the folders if they do not exist yet.
# NOTE: this runs at import time, not only when the script is executed.
Path(PDF_DIR).mkdir(parents=True, exist_ok=True)
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)


def scrape_pdf_links():
    """
    Scrape the statistics page and collect every PDF link it contains.

    Returns:
        list[dict]: one entry per distinct PDF URL, in page order, with keys
        ``'url'`` (absolute URL), ``'text'`` (stripped anchor text) and
        ``'filename'`` (name to use on disk). An empty list is returned when
        the page cannot be fetched.
    """
    # Function-scope import: the module only imports urljoin at file level.
    from urllib.parse import unquote, urlsplit

    print(f"Récupération des liens depuis {STATS_PAGE_URL}...")

    # Only the HTTP call is guarded: a network/HTTP failure means "no links",
    # while a programming error further down should stay visible.
    try:
        response = requests.get(STATS_PAGE_URL, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"✗ Erreur lors du scraping: {str(e)}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    pdf_links = []
    seen_urls = set()
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '.pdf' not in href.lower():
            continue

        # Convert to an absolute URL when the href is relative.
        full_url = urljoin(STATS_PAGE_URL, href)

        # The same document linked twice on the page must not be processed
        # (and reported) twice.
        if full_url in seen_urls:
            continue
        seen_urls.add(full_url)

        # Build the file name from the URL *path* only, so a query string or
        # fragment ("x.pdf?ver=2", "x.pdf#page=1") does not end up in the name
        # (those characters are invalid in Windows file names), and decode
        # percent-escapes so accented month names stay recognisable.
        filename = os.path.basename(unquote(urlsplit(full_url).path))
        if '.pdf' not in filename.lower():
            # The ".pdf" part lives in the query string; keep the historical
            # naming rather than dropping the information.
            filename = os.path.basename(full_url)

        pdf_links.append({
            'url': full_url,
            # Anchor text; not used by the rest of the script, kept for reference.
            'text': link.get_text(strip=True),
            'filename': filename,
        })

    print(f"✓ Trouvé {len(pdf_links)} liens PDF")
    return pdf_links


def extract_date_from_filename(filename):
    """
    Try to extract the period (year, month) from a PDF file name.

    Numeric forms are tried first, in this order: ``YYYY-MM``, ``MM-YYYY``,
    ``MM-YY``. A candidate is accepted only when it is not part of a longer
    run of digits and its month is in 1..12; otherwise the next candidate is
    examined. If no numeric form fits, a French month name combined with a
    ``20xx`` year anywhere in the name is used.

    Args:
        filename: file name (not a full path) to inspect.

    Returns:
        tuple: ``(year, month)`` as integers, or ``(None, None)`` when no
        plausible date is found.
    """
    # (pattern, index of the year group, index of the month group).
    # The digit look-arounds stop a pattern from matching inside a longer
    # number, e.g. reading "08-2025" as month 08 / year 20.
    numeric_patterns = [
        (r'(?<!\d)(\d{4})-(\d{2})(?!\d)', 1, 2),  # 2024-08
        (r'(?<!\d)(\d{2})-(\d{4})(?!\d)', 2, 1),  # 08-2025
        (r'(?<!\d)(\d{2})-(\d{2})(?!\d)', 2, 1),  # 08-25, Stat-08-25
    ]

    for pattern, year_group, month_group in numeric_patterns:
        for match in re.finditer(pattern, filename):
            year_text = match.group(year_group)
            year = int(year_text)
            month = int(match.group(month_group))
            if len(year_text) == 2:
                # Two-digit years are assumed to belong to the 2000s.
                year += 2000
            # Reject impossible dates instead of returning them: callers
            # build a datetime from the result.
            if year >= 1 and 1 <= month <= 12:
                return year, month

    # Fallback: French month names (with and without accents, plus the
    # 'septempre' misspelling listed by the original author).
    mois_fr = {
        'janvier': 1, 'fevrier': 2, 'février': 2, 'mars': 3, 'avril': 4,
        'mai': 5, 'juin': 6, 'juillet': 7, 'aout': 8, 'août': 8,
        'septembre': 9, 'septempre': 9, 'octobre': 10, 'novembre': 11, 'decembre': 12, 'décembre': 12
    }

    # Without a 20xx year a month name alone is useless, so look for it once.
    year_match = re.search(r'20\d{2}', filename)
    if year_match:
        filename_lower = filename.lower()
        for mois_nom, mois_num in mois_fr.items():
            if mois_nom in filename_lower:
                return int(year_match.group(0)), mois_num

    return None, None


def download_pdf(url, save_path):
    """
    Download the PDF at ``url`` to ``save_path`` unless it is already there.

    The content is first written to ``save_path + ".part"`` and then moved
    into place, so an interrupted or failed write never leaves a truncated
    file that later runs would mistake for a completed download.

    Args:
        url: address of the PDF.
        save_path: destination path of the file on disk.

    Returns:
        bool: True when the file is available at ``save_path`` (already
        present or freshly downloaded), False on HTTP or I/O failure.
    """
    # An existing zero-byte file is the leftover of a failed write, not a
    # valid download: fetch it again instead of skipping it forever.
    try:
        already_downloaded = os.path.getsize(save_path) > 0
    except OSError:
        already_downloaded = False

    if already_downloaded:
        print("  ⏭️  Déjà téléchargé, passage au suivant")
        return True

    temp_path = f"{save_path}.part"
    try:
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"✗ Échec ({response.status_code}): {os.path.basename(save_path)}")
            return False

        with open(temp_path, 'wb') as f:
            f.write(response.content)
        # Atomic on the same filesystem; overwrites a stale empty file.
        os.replace(temp_path, save_path)
        return True
    except (requests.RequestException, OSError) as e:
        print(f"✗ Erreur lors du téléchargement: {str(e)}")
        try:
            os.remove(temp_path)
        except OSError:
            # Nothing was written, or the partial file cannot be removed;
            # cleanup is best effort and must not mask the real failure.
            pass
        return False


def _first_count(after_label):
    """Return the first integer found right after a row label, or None."""
    # Either a plain number of 1-4 digits, or the thousands form
    # "d ddd" (one digit, one whitespace character, three digits).
    match = re.search(r'^\s*(\d\s\d{3}|\d{1,4})(?:\s|$)', after_label)
    if match is None:
        return None
    # \s matches every Unicode whitespace (regular, non-breaking, narrow
    # non-breaking, tab...), so every one of them must be removed before
    # int(); stripping only ' ' and '\xa0' made int() raise on the others.
    return int(re.sub(r'\s', '', match.group(1)))


def _parse_first_table(text):
    """Scan page text for the two rows of interest in the first table."""
    camping_cars = None
    vans = None
    in_first_table = False

    for line in text.split('\n'):
        # Any line mentioning "Immatriculations" (which includes the
        # "Immatriculations du mois" heading) opens the table.
        if 'Immatriculations' in line:
            in_first_table = True
            continue

        if not in_first_table:
            continue

        # Stop as soon as the next section starts.
        if 'Cumul sur' in line or 'STATISTIQUES' in line:
            break

        if 'Camping-cars neufs' in line and 'incl' in line and 'Vans' in line:
            # Row "Camping-cars neufs incl. Vans": numbers follow "Vans".
            count = _first_count(line.split("Vans", 1)[1])
            if count is not None:
                camping_cars = count
        elif line.strip().startswith('Vans') and 'incl' not in line:
            # Row "Vans" alone (not the "incl. Vans" row).
            count = _first_count(line.split("Vans", 1)[1])
            if count is not None:
                vans = count

    return camping_cars, vans


def extract_data_from_pdf(pdf_path):
    """
    Extract the "Camping-cars neufs incl. Vans" and "Vans" figures of a PDF.

    Only the text of the first page is read. Within it, only the lines
    between the first line containing "Immatriculations" and the next line
    containing "Cumul sur" or "STATISTIQUES" are considered, and for each row
    only the first number after the label is kept (presumably the "n"
    column - confirm against a sample PDF).

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        tuple: ``(camping_cars, vans)``; each item is an int, or None when
        the corresponding row was not found. ``(None, None)`` is returned
        when the file cannot be read or parsed.
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            if len(pdf_reader.pages) == 0:
                return None, None
            # Guard against an empty/None extraction result.
            text = pdf_reader.pages[0].extract_text() or ""

        return _parse_first_table(text)

    except Exception as e:
        # Deliberately broad: one unreadable PDF must not stop the whole run.
        print(f"✗ Erreur lors de l'extraction de {os.path.basename(pdf_path)}: {str(e)}")
        return None, None


def _autosize_columns(worksheet):
    """Set each column width to its longest cell text + 2, capped at 50."""
    for column in worksheet.columns:
        longest = max((len(str(cell.value)) for cell in column), default=0)
        column_letter = column[0].column_letter
        worksheet.column_dimensions[column_letter].width = min(longest + 2, 50)


def main():
    """
    Run the whole pipeline: scrape, download, extract, export, summarise.

    For every PDF link found on the statistics page the file is downloaded
    into PDF_DIR; when a date can be derived from its name, the two figures
    are extracted and a row is recorded (even if extraction failed, in which
    case the figures are None). The rows are written, newest first, to a
    timestamped Excel file and CSV file in OUTPUT_DIR.

    Returns:
        pandas.DataFrame | None: the result table, or None when no link was
        found or no row could be produced.
    """
    print("=" * 70)
    print("TÉLÉCHARGEMENT ET EXTRACTION DES STATISTIQUES UNI VDL")
    print("=" * 70)

    # Scrape the PDF links
    pdf_links = scrape_pdf_links()

    if not pdf_links:
        print("\n⚠️  Aucun lien PDF trouvé. Vérifiez votre connexion internet.")
        return None

    print(f"\n{len(pdf_links)} PDFs trouvés sur le site")

    results = []

    # Process each PDF
    print("\n" + "-" * 70)
    print("TÉLÉCHARGEMENT ET EXTRACTION")
    print("-" * 70)

    for i, pdf_info in enumerate(pdf_links, 1):
        url = pdf_info['url']
        filename = pdf_info['filename']
        save_path = os.path.join(PDF_DIR, filename)

        print(f"\n[{i}/{len(pdf_links)}] {filename}")

        # Derive the period from the file name. Building the datetime here
        # validates it: a missing date (None -> TypeError) and an impossible
        # one such as month 25 (ValueError) are both treated as "no date"
        # instead of aborting the whole run later on.
        year, month = extract_date_from_filename(filename)
        try:
            period_start = datetime(year, month, 1)
        except (TypeError, ValueError):
            print("  ⚠️  Impossible d'extraire la date du nom de fichier")
            # Download anyway, for reference
            download_pdf(url, save_path)
            continue

        print(f"  📅 Date détectée: {year}-{month:02d}")

        if not download_pdf(url, save_path):
            continue

        print("  ✓ Téléchargé")

        camping_cars, vans = extract_data_from_pdf(save_path)

        if camping_cars is not None and vans is not None:
            print(f"  ✓ Données extraites: CC={camping_cars}, Vans={vans}")
        else:
            print("  ⚠️  Impossible d'extraire les données")

        # The row is kept even without figures so the gap shows in the export.
        results.append({
            'Date': f"{year}-{month:02d}",
            'Année': year,
            'Mois': month,
            # Month name as produced by strftime('%B') for the current locale.
            'Mois_nom': period_start.strftime('%B'),
            'Camping-cars neufs incl. Vans': camping_cars,
            'Vans': vans,
            'Fichier': filename,
            'URL': url
        })

    if not results:
        print("\n⚠️  Aucune donnée extraite.")
        return None

    # Build the table, newest period first
    df = pd.DataFrame(results)
    df = df.sort_values('Date', ascending=False)

    # Save as Excel and CSV
    print("\n" + "-" * 70)
    print("SAUVEGARDE DES RÉSULTATS")
    print("-" * 70)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    excel_path = os.path.join(OUTPUT_DIR, f"statistiques_univdl_{timestamp}.xlsx")
    csv_path = os.path.join(OUTPUT_DIR, f"statistiques_univdl_{timestamp}.csv")

    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Statistiques')
        _autosize_columns(writer.sheets['Statistiques'])

    # utf-8-sig writes a BOM at the start of the file
    df.to_csv(csv_path, index=False, encoding='utf-8-sig')

    print(f"✓ Excel sauvegardé: {excel_path}")
    print(f"✓ CSV sauvegardé: {csv_path}")

    # Summary
    has_data = df['Camping-cars neufs incl. Vans'].notna()
    extracted = int(has_data.sum())
    missing = int((~has_data).sum())

    print("\n" + "=" * 70)
    print("RÉSUMÉ")
    print("=" * 70)
    print(f"PDFs trouvés: {len(pdf_links)}")
    print(f"PDFs téléchargés: {len(results)}")
    print(f"Données extraites avec succès: {extracted}")

    if missing > 0:
        print(f"\n⚠️  {missing} PDF(s) sans données extraites (vérifier le format)")

    print("\n📊 Aperçu des résultats:")
    print(df[['Date', 'Camping-cars neufs incl. Vans', 'Vans']].head(10).to_string(index=False))

    return df


if __name__ == "__main__":
    # Script entry point: run the pipeline and report any failure on the
    # console instead of letting the exception escape.
    import traceback

    try:
        df = main()
    except KeyboardInterrupt:
        # Ctrl+C: stop quietly with a short notice.
        print("\n\n⚠️  Arrêt demandé par l'utilisateur")
    except Exception as exc:
        # Anything else: show the message followed by the full traceback.
        print(f"\n\n❌ Erreur inattendue: {exc!s}")
        traceback.print_exc()