You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
232 lines
8.6 KiB
232 lines
8.6 KiB
import requests
|
|
from bs4 import BeautifulSoup
|
|
import os
|
|
import time
|
|
|
|
# Film titles matching the character list supplied earlier. The main loop
# iterates over this list and passes each title to scrape_citations().
films = [
    "Harry Potter",
    "Star Wars",
    "Le Seigneur des Anneaux",
    "Batman",
    "Spider-Man",
    "Iron Man",
    "Les Avengers",
    "Doctor Strange",
    "Deadpool",
    "X-Men",
    "Black Panther",
    "Aquaman",
    "La Ligue des Justiciers",
    "Shazam",
    "Les Gardiens de la Galaxie",
    "Le Parrain",
    "Le Chevalier Noir",
    "Inception",
    "Fight Club",
    "Pulp Fiction",
    "Forrest Gump",
    "Matrix",
    "Jurassic Park",
    "Gladiator",
    "Le Silence des Agneaux",
    "La Liste de Schindler",
    "Braveheart",
    "La Ligne Verte",
    "Il faut sauver le soldat Ryan",
    "Thor",
    "Captain America",
    "Logan",
    "Joker",
    "Wonder Woman",
    "L'Homme d'Acier",
    "Hunger Games",
    "Divergente",
    "Les Animaux Fantastiques",
    "Le Hobbit",
    "Pirates des Caraïbes",
    "Toy Story",
    "La Reine des Neiges",
    "Le Roi Lion",
    "La Belle et la Bête",
    "Aladdin",
    "Mulan",
    "Cendrillon",
    "La Belle au bois dormant",
    "Blanche-Neige",
    "Raiponce",
    "Vaiana",
    "Zootopie",
    "Vice-versa",
    "Le Monde de Nemo",
    "Les Indestructibles",
    "Ratatouille",
    "WALL-E",
    "Là-haut",
    "Coco",
    "Monstres & Cie",
    "Cars",
    "Madagascar",
    "Shrek",
    "Kung Fu Panda",
    "Dragons",
    "L'Âge de glace",
    "Les Croods",
]
|
|
|
|
# Small local lookup table: film title -> release year, stored as a string.
film_years = {
    "Harry Potter": "2001",
    "Star Wars": "1977",
    "Le Seigneur des Anneaux": "2001",
    "Batman": "1989",
    "Spider-Man": "2002",
    "Iron Man": "2008",
    "Les Avengers": "2012",
    "Doctor Strange": "2016",
    "Deadpool": "2016",
    "X-Men": "2000",
    "Black Panther": "2018",
    "Aquaman": "2018",
    "La Ligue des Justiciers": "2017",
    "Shazam": "2019",
    "Les Gardiens de la Galaxie": "2014",
    "Le Parrain": "1972",
    "Le Chevalier Noir": "2008",
    "Inception": "2010",
    "Fight Club": "1999",
    "Pulp Fiction": "1994",
    "Forrest Gump": "1994",
    "Matrix": "1999",
    "Jurassic Park": "1993",
    "Gladiator": "2000",
    "Le Silence des Agneaux": "1991",
    "La Liste de Schindler": "1993",
    "Braveheart": "1995",
    "La Ligne Verte": "1999",
    "Il faut sauver le soldat Ryan": "1998",
    "Thor": "2011",
    "Captain America": "2011",
    "Logan": "2017",
    "Joker": "2019",
    "Wonder Woman": "2017",
    "L'Homme d'Acier": "2013",
    "Hunger Games": "2012",
    "Divergente": "2014",
    "Les Animaux Fantastiques": "2016",
    "Le Hobbit": "2012",
    "Pirates des Caraïbes": "2003",
    "Toy Story": "1995",
    "La Reine des Neiges": "2013",
    "Le Roi Lion": "1994",
    "La Belle et la Bête": "1991",
    "Aladdin": "1992",
    "Mulan": "1998",
    "Cendrillon": "1950",
    "La Belle au bois dormant": "1959",
    "Blanche-Neige": "1937",
    "Raiponce": "2010",
    "Vaiana": "2016",
    "Zootopie": "2016",
    "Vice-versa": "2015",
    "Le Monde de Nemo": "2003",
    "Les Indestructibles": "2004",
    "Ratatouille": "2007",
    "WALL-E": "2008",
    "Là-haut": "2009",
    "Coco": "2017",
    "Monstres & Cie": "2001",
    "Cars": "2006",
    "Madagascar": "2005",
    "Shrek": "2001",
    "Kung Fu Panda": "2008",
    "Dragons": "2010",
    "L'Âge de glace": "2002",
    "Les Croods": "2013",
}
|
|
|
|
def search_image_bing(personnage, film):
    """Search Bing Images for a character and return the first image source.

    Args:
        personnage: Character name, used as the first part of the query.
        film: Film title, appended to the query after the character name.

    Returns:
        The ``src`` attribute of the first ``<img class="mimg">`` element of
        the result page, or ``None`` when the request fails, the server
        answers with an error status, or no such element is present.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Let requests build the query string: accents and characters such as
        # '&' ("Monstres & Cie") get percent-encoded instead of being read as
        # a parameter separator, and spaces are still sent as '+'.
        response = requests.get(
            "https://www.bing.com/images/search",
            params={"q": f"{personnage} {film}"},
            headers=headers,
            timeout=10,  # never hang the whole scrape on one image lookup
        )
    except requests.RequestException as exc:
        print(f"Erreur lors de la recherche d'image pour {personnage} ({film}) : {exc}")
        return None

    if not response.ok:
        return None

    # Same built-in parser as scrape_citations(); avoids depending on the
    # optional lxml package being installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    # Only the first matching image is considered.
    image = soup.find('img', {'class': 'mimg'})
    if image is None:
        return None
    return image.get('src')
|
|
|
|
# Common French function words; is_french() uses them as a cheap language
# heuristic.
common_french_words = {
    'le', 'la', 'les', 'et', 'est', 'pour', 'que', 'qui', 'un', 'une', 'de', 'du', 'ce', 'cela',
    'dans', 'sur', 'par', 'avec', 'en', 'au', 'aux', 'des', 'ou', 'mais', 'si', 'ne', 'pas',
    'il', 'elle', 'ils', 'elles', 'nous', 'vous', 'tu', 'je', 'me', 'te', 'se', 'sont', 'été',
    'avoir', 'être', 'faire', 'dire', 'pouvoir', 'aller', 'venir', 'voir', 'vouloir', 'savoir',
    'bien', 'tout', 'mon', 'ton', 'son', 'notre', 'votre', 'leur', 'plus', 'aussi', 'comme',
    'faut', 'a',
}

# Punctuation removed from both ends of a token before the lookup, so that
# "vous!" or "mais," are compared as "vous" and "mais".
_TOKEN_EDGE_PUNCTUATION = ".,;:!?…\"'’«»“”()[]-–—"


def is_french(citation_text):
    """Guess whether a quote is written in French.

    The text is lower-cased and split on whitespace; each token has its
    leading/trailing punctuation removed and is then looked up in
    ``common_french_words``.

    Args:
        citation_text: The quote to classify.

    Returns:
        ``True`` when at least 50% of the tokens are in
        ``common_french_words``; ``False`` otherwise, including when the text
        contains no token at all (empty or punctuation-only input).
    """
    stripped = (raw.strip(_TOKEN_EDGE_PUNCTUATION) for raw in citation_text.lower().split())
    tokens = [token for token in stripped if token]

    # Nothing to classify: answer False instead of dividing by zero.
    if not tokens:
        return False

    recognised = sum(token in common_french_words for token in tokens)
    return recognised / len(tokens) >= 0.5
|
|
|
|
def get_film_year(film_name):
    """Look up a film's release year in the local ``film_years`` table.

    Only the local table is consulted; no online lookup is performed.

    Args:
        film_name: Film title, expected to match a ``film_years`` key exactly.

    Returns:
        The value stored for ``film_name``, or the string ``'0'`` when the
        title is not in the table.
    """
    return film_years.get(film_name, '0')
|
|
|
|
def scrape_citations(film_name):
    """Fetch the quotes kaakook.fr lists for a film and format them as records.

    A POST search is sent with the film title and an empty quote filter. Each
    ``<article>`` of the answer is turned into one record unless it is
    filtered out. An article is skipped when:

    * it lacks the ``<a>``, ``<cite>`` or ``<footer>`` element read below;
    * it contains a ``<br>`` tag (multi-line quote);
    * the quote text is empty or starts with ``-`` (dialogue);
    * ``is_french`` rejects the quote text;
    * the footer has fewer than two links, or the second one reads
      ``Inconnu`` (unknown character).

    Args:
        film_name: Film title sent as the ``extrfilm`` form field.

    Returns:
        A list of strings shaped
        ``"\\n{citation}; {source}; {character}; {year}; {image_url}"``.
        The list is empty when the request fails, the status code is not 200,
        or no article passes the filters.
    """
    url = "https://www.kaakook.fr/rechercher"
    data = {
        "extrfilm": film_name,  # film title
        "extrcitation": "",  # left empty to get every quote of the film
    }

    try:
        response = requests.post(url, data=data, timeout=15)
    except requests.RequestException as exc:
        # A network failure on one film must not abort the whole run.
        print(f"Erreur avec le film {film_name} : {exc}")
        return []

    if response.status_code != 200:
        print(f"Erreur avec le film {film_name}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    year = get_film_year(film_name)  # same for every quote of this film
    citations = []

    for article in soup.find_all('article'):
        quote_link = article.find('a')
        cite_tag = article.find('cite')
        footer = article.find('footer')
        if quote_link is None or cite_tag is None or footer is None:
            # Not shaped like a quote entry; skip it rather than crash.
            continue

        # Skip multi-line quotes. The tag is searched for directly because
        # Beautiful Soup serialises it as "<br/>", so a "<br>" substring test
        # on str(article) never matches.
        if article.find('br') is not None:
            continue

        # Collapse line breaks and runs of whitespace into single spaces.
        citation_text = " ".join(quote_link.get_text().split())

        # Empty text carries no quote; a leading dash marks a dialogue.
        if not citation_text or citation_text.startswith('-'):
            continue

        if not is_french(citation_text):
            continue

        source = cite_tag.get_text().strip()

        # The second footer link holds the character name; without it the
        # character is unknown and the quote is dropped.
        footer_links = footer.find_all('a')
        if len(footer_links) < 2:
            continue
        character = footer_links[1].get_text().strip()
        if character == 'Inconnu':
            continue

        img_tag = article.find('img')
        image_url = img_tag.get('src') if img_tag is not None else None

        if not image_url:
            # No image on the page: fall back to a Bing search.
            # NOTE(review): the original indentation was lost; the download is
            # assumed to belong to this fallback branch only - confirm.
            try:
                image_url = search_image_bing(character, film_name)
            except requests.RequestException as exc:
                print(f"Erreur lors de la recherche d'image pour {character} : {exc}")
                image_url = None

            if image_url:
                download_image(image_url, character)
            else:
                # Keep the placeholder instead of writing "None" in the record.
                image_url = 'images/default.jpg'

        # Record layout: "citation; source; character; year; image".
        citations.append(f"\n{citation_text}; {source}; {character}; {year}; {image_url}")

    return citations
|
|
|
|
def download_image(image_url, character_name):
    """Download an image into the local ``images`` folder, once per character.

    The file is named after the character: spaces and path separators become
    underscores and ``.jpg`` is appended. The ``images`` folder is created
    when missing. If the target file already exists nothing is downloaded.
    Download and write failures are printed, never raised.

    Args:
        image_url: Address of the image to fetch.
        character_name: Character the image belongs to; drives the file name.

    Returns:
        None.
    """
    # Path separators in a scraped name would otherwise make the path point
    # to a sub-folder of (or outside) "images".
    safe_name = character_name.replace(' ', '_')
    for separator in ('/', '\\'):
        safe_name = safe_name.replace(separator, '_')
    image_path = os.path.join("images", f"{safe_name}.jpg")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("images", exist_ok=True)

    # Already downloaded for this character: keep the existing file.
    if os.path.exists(image_path):
        return

    try:
        response = requests.get(image_url, timeout=15)
        # Do not save an HTML error page under a .jpg name.
        response.raise_for_status()
        with open(image_path, 'wb') as handler:
            handler.write(response.content)
    except (requests.RequestException, OSError) as e:
        print(f"Erreur lors du téléchargement de l'image {image_url} : {e}")
    else:
        print(f"Image téléchargée : {image_path}")
|
|
|
|
def save_citations_to_file(citations):
    """Append quote records to the single output file ``citation.txt``.

    The file is opened in append mode with UTF-8 encoding, and every record
    is written followed by a newline.

    Args:
        citations: Iterable of record strings to append.
    """
    with open("citation.txt", mode="a", encoding="utf-8") as output:
        output.writelines(record + "\n" for record in citations)
|
|
|
|
# Scrape every film in turn and append whatever was found to the output file.
for film in films:
    print(f"Recherche des citations pour le film : {film}")
    citations = scrape_citations(film)

    if not citations:
        print(f"Aucune citation trouvée pour {film}.")
    else:
        save_citations_to_file(citations)
        print(f"Ajouté {len(citations)} citation(s) pour {film}.")

    # Short pause between films so requests are not sent too quickly.
    # NOTE(review): indentation was lost in the source; the pause is assumed
    # to run once per film - confirm.
    time.sleep(2)
|