parent
55d1d5372e
commit
d792018fb2
@ -1,231 +0,0 @@
|
|||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
|
|
||||||
# Release year of every film to scrape, keyed by the French title used for
# the search. Years are kept as strings because they are only ever written
# out as text. Insertion order matters: it is the order of the scrape.
film_years = {
    "Harry Potter": "2001",
    "Star Wars": "1977",
    "Le Seigneur des Anneaux": "2001",
    "Batman": "1989",
    "Spider-Man": "2002",
    "Iron Man": "2008",
    "Les Avengers": "2012",
    "Doctor Strange": "2016",
    "Deadpool": "2016",
    "X-Men": "2000",
    "Black Panther": "2018",
    "Aquaman": "2018",
    "La Ligue des Justiciers": "2017",
    "Shazam": "2019",
    "Les Gardiens de la Galaxie": "2014",
    "Le Parrain": "1972",
    "Le Chevalier Noir": "2008",
    "Inception": "2010",
    "Fight Club": "1999",
    "Pulp Fiction": "1994",
    "Forrest Gump": "1994",
    "Matrix": "1999",
    "Jurassic Park": "1993",
    "Gladiator": "2000",
    "Le Silence des Agneaux": "1991",
    "La Liste de Schindler": "1993",
    "Braveheart": "1995",
    "La Ligne Verte": "1999",
    "Il faut sauver le soldat Ryan": "1998",
    "Thor": "2011",
    "Captain America": "2011",
    "Logan": "2017",
    "Joker": "2019",
    "Wonder Woman": "2017",
    "L'Homme d'Acier": "2013",
    "Hunger Games": "2012",
    "Divergente": "2014",
    "Les Animaux Fantastiques": "2016",
    "Le Hobbit": "2012",
    "Pirates des Caraïbes": "2003",
    "Toy Story": "1995",
    "La Reine des Neiges": "2013",
    "Le Roi Lion": "1994",
    "La Belle et la Bête": "1991",
    "Aladdin": "1992",
    "Mulan": "1998",
    "Cendrillon": "1950",
    "La Belle au bois dormant": "1959",
    "Blanche-Neige": "1937",
    "Raiponce": "2010",
    "Vaiana": "2016",
    "Zootopie": "2016",
    "Vice-versa": "2015",
    "Le Monde de Nemo": "2003",
    "Les Indestructibles": "2004",
    "Ratatouille": "2007",
    "WALL-E": "2008",
    "Là-haut": "2009",
    "Coco": "2017",
    "Monstres & Cie": "2001",
    "Cars": "2006",
    "Madagascar": "2005",
    "Shrek": "2001",
    "Kung Fu Panda": "2008",
    "Dragons": "2010",
    "L'Âge de glace": "2002",
    "Les Croods": "2013",
}

# Titles to scrape, in table order. Derived from the year table instead of
# being listed a second time, so the two can never drift apart.
films = list(film_years)
|
|
||||||
|
|
||||||
def search_image_bing(personnage, film):
    """Return the URL of the first Bing Images thumbnail for a character.

    Args:
        personnage: Character name, used as the first part of the query.
        film: Film title, appended to the query.

    Returns:
        The ``src`` attribute of the first ``<img class="mimg">`` element of
        the result page, or ``None`` when the request fails or the page
        contains no such element.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

    try:
        response = requests.get(
            "https://www.bing.com/images/search",
            # Let requests URL-encode the query: a title such as
            # "Monstres & Cie" contains characters that would otherwise be
            # read as query-string syntax and truncate the search.
            params={'q': f"{personnage} {film}"},
            headers=headers,
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException as error:
        # Best effort: a failed image search must not abort the scrape.
        print(f"Erreur lors de la recherche d'image pour {personnage} : {error}")
        return None

    # Use the built-in parser, like the rest of the file, so that lxml is
    # not an extra runtime requirement.
    soup = BeautifulSoup(response.text, 'html.parser')

    # Only the first matching thumbnail is considered.
    image = soup.find('img', {'class': 'mimg'})
    if image is None:
        return None
    return image.get('src')
|
|
||||||
|
|
||||||
# Frequent French words used as a cheap language detector by is_french().
common_french_words = {
    'le', 'la', 'les', 'et', 'est', 'pour', 'que', 'qui', 'un', 'une', 'de', 'du', 'ce', 'cela',
    'dans', 'sur', 'par', 'avec', 'en', 'au', 'aux', 'des', 'ou', 'mais', 'si', 'ne', 'pas',
    'il', 'elle', 'ils', 'elles', 'nous', 'vous', 'tu', 'je', 'me', 'te', 'se', 'sont', 'été',
    'avoir', 'être', 'faire', 'dire', 'pouvoir', 'aller', 'venir', 'voir', 'vouloir', 'savoir',
    'bien', 'tout', 'mon', 'ton', 'son', 'notre', 'votre', 'leur', 'plus', 'aussi', 'comme',
    'faut', 'a',
}

# Characters removed from both ends of a token before the lookup, so that
# "nous," or "pas." still match their entry in the word set.
_EDGE_PUNCTUATION = ".,;:!?\"'()[]«»…“”’-"


def is_french(citation_text):
    """Tell whether a quote looks French, based on its share of common words.

    The text is lower-cased and split on whitespace; each token has its
    surrounding punctuation removed, and tokens that were punctuation only
    (for example a stand-alone "!" or "?") are ignored.

    Args:
        citation_text: The quote to classify.

    Returns:
        ``True`` when at least 50% of the remaining words belong to
        ``common_french_words``; ``False`` otherwise, including when the
        text contains no word at all.
    """
    stripped = (token.strip(_EDGE_PUNCTUATION) for token in citation_text.lower().split())
    words = [word for word in stripped if word]

    # Guard against empty or punctuation-only input, which would otherwise
    # divide by zero below.
    if not words:
        return False

    french_word_count = sum(1 for word in words if word in common_french_words)
    return french_word_count / len(words) >= 0.5
|
|
||||||
|
|
||||||
def get_film_year(film_name):
    """Look up the release year of a film in the local ``film_years`` table.

    Args:
        film_name: Title exactly as it is spelled in ``film_years``.

    Returns:
        The year stored in the table, or the string ``'0'`` when the title
        is not in it.
    """
    return film_years.get(film_name, '0')
|
|
||||||
|
|
||||||
def _extract_citation(article):
    """Parse one ``<article>`` into ``(text, source, character, image_src)``.

    Returns ``None`` when the article must be skipped: unexpected markup,
    multi-line quote, dialogue, non-French text or unknown character.
    ``image_src`` is ``None`` when the article carries no usable image.
    """
    citation_link = article.find('a')
    cite_tag = article.find('cite')
    footer = article.find('footer')
    if citation_link is None or cite_tag is None or footer is None:
        # Markup differs from what the scraper expects; skip, don't crash.
        return None

    # Skip multi-line quotes. This is a DOM lookup rather than a search for
    # the literal "<br>" in the serialised HTML, because BeautifulSoup
    # writes the tag back as "<br/>" and the literal would never match.
    if citation_link.find('br') is not None:
        return None

    # Collapse newlines and runs of whitespace into single spaces.
    citation_text = " ".join(citation_link.get_text().split())
    if not citation_text:
        return None

    # A leading dash marks a dialogue line.
    if citation_text.startswith('-'):
        return None

    if not is_french(citation_text):
        return None

    source = cite_tag.get_text().strip()

    # The character name is read from the second link of the footer;
    # articles without it, or labelled 'Inconnu', are ignored.
    footer_links = footer.find_all('a')
    if len(footer_links) < 2:
        return None
    character = footer_links[1].get_text().strip()
    if character == 'Inconnu':
        return None

    img_tag = article.find('img')
    image_src = img_tag.get('src') if img_tag is not None else None

    return citation_text, source, character, image_src


def scrape_citations(film_name):
    """Fetch the quotes kaakook.fr lists for a film and format them.

    Posts a search for ``film_name``, then keeps each ``<article>`` of the
    result page that is a single-line French quote with a known character.
    When an article has no image, one is searched on Bing and downloaded;
    if that search finds nothing, the default image path is recorded.

    Args:
        film_name: Film title sent as the ``extrfilm`` search field.

    Returns:
        A list of strings shaped as
        ``"\\n<citation>; <source>; <character>; <year>; <image>"``. The
        list is empty when the request fails, the status code is not 200,
        or no article qualifies.
    """
    url = "https://www.kaakook.fr/rechercher"
    data = {
        "extrfilm": film_name,  # film title
        "extrcitation": ""  # left empty to get every quote of the film
    }

    try:
        response = requests.post(url, data=data, timeout=10)
    except requests.RequestException as error:
        print(f"Erreur avec le film {film_name}. {error}")
        return []

    if response.status_code != 200:
        print(f"Erreur avec le film {film_name}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    # The year depends only on the film, so look it up once.
    year = get_film_year(film_name)

    citations = []
    for article in soup.find_all('article'):
        extracted = _extract_citation(article)
        if extracted is None:
            continue
        citation_text, source, character, image_url = extracted

        if not image_url:
            # No image in the article: try Bing, and fall back to the
            # default image instead of recording the literal "None".
            image_url = search_image_bing(character, film_name)
            if image_url:
                download_image(image_url, character)
            else:
                image_url = 'images/default.jpg'

        # Record format: "citation; source; character; year; image".
        citations.append(f"\n{citation_text}; {source}; {character}; {year}; {image_url}")

    return citations
|
|
||||||
|
|
||||||
def download_image(image_url, character_name):
    """Download a character image into the local ``images`` folder.

    The file is named after the character (spaces replaced by underscores,
    ``.jpg`` extension). Nothing is downloaded when that file already
    exists. Failures are reported on stdout and never raised, so a bad
    image cannot interrupt the scrape.

    Args:
        image_url: Address of the image to fetch.
        character_name: Character the image belongs to; used for the file
            name.
    """
    # Keep the "spaces -> underscores" naming, and also neutralise path
    # separators so a name cannot point outside the images folder.
    safe_name = character_name.replace(' ', '_').replace('/', '_').replace('\\', '_')
    image_path = os.path.join("images", f"{safe_name}.jpg")

    # Create the folder if needed; no error when it is already there.
    os.makedirs("images", exist_ok=True)

    if os.path.exists(image_path):
        return

    try:
        response = requests.get(image_url, timeout=10)
        # Do not save an HTTP error page under a .jpg name.
        response.raise_for_status()
        with open(image_path, 'wb') as handler:
            handler.write(response.content)
    except (requests.RequestException, OSError) as e:
        print(f"Erreur lors du téléchargement de l'image {image_url} : {e}")
    else:
        print(f"Image téléchargée : {image_path}")
|
|
||||||
|
|
||||||
def save_citations_to_file(citations):
    """Append every citation to ``citation.txt``, one newline after each.

    The file is opened in append mode with UTF-8 encoding, so records from
    earlier calls are kept.

    Args:
        citations: Iterable of already formatted citation strings.
    """
    with open("citation.txt", "a", encoding="utf-8") as output:
        output.writelines(f"{entry}\n" for entry in citations)
|
|
||||||
|
|
||||||
# Scrape every film in turn and append what was found to the output file.
for film in films:
    print(f"Recherche des citations pour le film : {film}")
    citations = scrape_citations(film)

    if not citations:
        print(f"Aucune citation trouvée pour {film}.")
    else:
        save_citations_to_file(citations)
        print(f"Ajouté {len(citations)} citation(s) pour {film}.")

    # Short pause between films to avoid sending requests too quickly.
    time.sleep(2)
|
|
Loading…
Reference in new issue