parent
55d1d5372e
commit
d792018fb2
@ -1,231 +0,0 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
import time
|
||||
|
||||
# Film titles to look up; the list goes with the character list provided earlier.
films = [
    "Harry Potter",
    "Star Wars",
    "Le Seigneur des Anneaux",
    "Batman",
    "Spider-Man",
    "Iron Man",
    "Les Avengers",
    "Doctor Strange",
    "Deadpool",
    "X-Men",
    "Black Panther",
    "Aquaman",
    "La Ligue des Justiciers",
    "Shazam",
    "Les Gardiens de la Galaxie",
    "Le Parrain",
    "Le Chevalier Noir",
    "Inception",
    "Fight Club",
    "Pulp Fiction",
    "Forrest Gump",
    "Matrix",
    "Jurassic Park",
    "Gladiator",
    "Le Silence des Agneaux",
    "La Liste de Schindler",
    "Braveheart",
    "La Ligne Verte",
    "Il faut sauver le soldat Ryan",
    "Thor",
    "Captain America",
    "Logan",
    "Joker",
    "Wonder Woman",
    "L'Homme d'Acier",
    "Hunger Games",
    "Divergente",
    "Les Animaux Fantastiques",
    "Le Hobbit",
    "Pirates des Caraïbes",
    "Toy Story",
    "La Reine des Neiges",
    "Le Roi Lion",
    "La Belle et la Bête",
    "Aladdin",
    "Mulan",
    "Cendrillon",
    "La Belle au bois dormant",
    "Blanche-Neige",
    "Raiponce",
    "Vaiana",
    "Zootopie",
    "Vice-versa",
    "Le Monde de Nemo",
    "Les Indestructibles",
    "Ratatouille",
    "WALL-E",
    "Là-haut",
    "Coco",
    "Monstres & Cie",
    "Cars",
    "Madagascar",
    "Shrek",
    "Kung Fu Panda",
    "Dragons",
    "L'Âge de glace",
    "Les Croods",
]
|
||||
|
||||
# Small local lookup table: film title -> release year (stored as a string).
film_years = {
    "Harry Potter": "2001", "Star Wars": "1977", "Le Seigneur des Anneaux": "2001",
    "Batman": "1989", "Spider-Man": "2002", "Iron Man": "2008",
    "Les Avengers": "2012", "Doctor Strange": "2016", "Deadpool": "2016",
    "X-Men": "2000", "Black Panther": "2018", "Aquaman": "2018",
    "La Ligue des Justiciers": "2017", "Shazam": "2019", "Les Gardiens de la Galaxie": "2014",
    "Le Parrain": "1972", "Le Chevalier Noir": "2008", "Inception": "2010",
    "Fight Club": "1999", "Pulp Fiction": "1994", "Forrest Gump": "1994",
    "Matrix": "1999", "Jurassic Park": "1993", "Gladiator": "2000",
    "Le Silence des Agneaux": "1991", "La Liste de Schindler": "1993", "Braveheart": "1995",
    "La Ligne Verte": "1999", "Il faut sauver le soldat Ryan": "1998", "Thor": "2011",
    "Captain America": "2011", "Logan": "2017", "Joker": "2019",
    "Wonder Woman": "2017", "L'Homme d'Acier": "2013", "Hunger Games": "2012",
    "Divergente": "2014", "Les Animaux Fantastiques": "2016", "Le Hobbit": "2012",
    "Pirates des Caraïbes": "2003", "Toy Story": "1995", "La Reine des Neiges": "2013",
    "Le Roi Lion": "1994", "La Belle et la Bête": "1991", "Aladdin": "1992",
    "Mulan": "1998", "Cendrillon": "1950", "La Belle au bois dormant": "1959",
    "Blanche-Neige": "1937", "Raiponce": "2010", "Vaiana": "2016",
    "Zootopie": "2016", "Vice-versa": "2015", "Le Monde de Nemo": "2003",
    "Les Indestructibles": "2004", "Ratatouille": "2007", "WALL-E": "2008",
    "Là-haut": "2009", "Coco": "2017", "Monstres & Cie": "2001",
    "Cars": "2006", "Madagascar": "2005", "Shrek": "2001",
    "Kung Fu Panda": "2008", "Dragons": "2010", "L'Âge de glace": "2002",
    "Les Croods": "2013",
}
|
||||
|
||||
# Search Bing Images for a character/film pair
def search_image_bing(personnage, film):
    """Return the ``src`` of the first Bing Images result for a character.

    The search query is the character name followed by the film title.

    Args:
        personnage: Character name to search for.
        film: Film title appended to the query.

    Returns:
        The ``src`` attribute of the first ``<img class="mimg">`` element of
        the result page, or ``None`` when the request fails, the server
        answers with an error status, or no such element is found.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

    try:
        # Going through ``params`` lets requests URL-encode the query, so a
        # title such as "Monstres & Cie" no longer cuts the query at "&".
        response = requests.get(
            "https://www.bing.com/images/search",
            params={'q': f"{personnage} {film}"},
            headers=headers,
            timeout=10,  # never hang the whole run on one lookup
        )
        response.raise_for_status()
    except requests.RequestException as error:
        # The image is optional: report the problem and let the caller go on.
        print(f"Erreur lors de la recherche d'image pour {personnage} ({film}) : {error}")
        return None

    # Same parser as the rest of the file; avoids requiring lxml to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    # Only the first thumbnail is of interest
    image = soup.find('img', class_='mimg')
    if image is None:
        return None
    return image.get('src')
|
||||
|
||||
# Heuristic check that a quote is written in French
common_french_words = {
    'le', 'la', 'les', 'et', 'est', 'pour', 'que', 'qui', 'un', 'une', 'de', 'du', 'ce', 'cela',
    'dans', 'sur', 'par', 'avec', 'en', 'au', 'aux', 'des', 'ou', 'mais', 'si', 'ne', 'pas',
    'il', 'elle', 'ils', 'elles', 'nous', 'vous', 'tu', 'je', 'me', 'te', 'se', 'sont', 'été',
    'avoir', 'être', 'faire', 'dire', 'pouvoir', 'aller', 'venir', 'voir', 'vouloir', 'savoir',
    'bien', 'tout', 'mon', 'ton', 'son', 'notre', 'votre', 'leur', 'plus', 'aussi', 'comme',
    'faut', 'a',
}

# Characters removed from both ends of a token before it is looked up
_TOKEN_EDGE_PUNCTUATION = ".,;:!?\"'’()[]«»…-–—"


def is_french(citation_text):
    """Tell whether *citation_text* looks like French.

    The text is lower-cased and split on whitespace; punctuation is stripped
    from the edges of every token and tokens that end up empty (for example
    a stand-alone "!" or "?") are ignored.

    Args:
        citation_text: The quote to classify.

    Returns:
        ``True`` when at least 50% of the remaining words belong to
        ``common_french_words``; ``False`` otherwise, including when the text
        holds no word at all.
    """
    stripped = (token.strip(_TOKEN_EDGE_PUNCTUATION) for token in citation_text.lower().split())
    words = [word for word in stripped if word]

    # Nothing to classify: answer False instead of dividing by zero.
    if not words:
        return False

    french_word_count = sum(1 for word in words if word in common_french_words)

    # French when at least half of the words are recognised
    return french_word_count / len(words) >= 0.5
|
||||
|
||||
# Release-year lookup backed by the local table
def get_film_year(film_name):
    """Return the year stored in ``film_years`` for *film_name*.

    Titles missing from the table yield the string ``'0'``.
    """
    return film_years.get(film_name, '0')
|
||||
|
||||
# Scrape the quotes of one film from kaakook.fr
def scrape_citations(film_name):
    """Collect the single-line French quotes of a film.

    Posts a search for *film_name* to kaakook.fr and inspects every
    ``<article>`` element of the answer. An article is skipped when:

    * it lacks a link, a ``<cite>`` or a ``<footer>`` element;
    * it contains a ``<br>`` tag (multi-line quote);
    * its text is empty or starts with ``-`` (dialogue);
    * ``is_french`` rejects its text;
    * its footer has fewer than two links, or the second one reads
      ``Inconnu`` (unknown character).

    Args:
        film_name: Title sent in the ``extrfilm`` form field.

    Returns:
        A list of strings, each one a newline followed by
        ``"quote; source; character; year; image"``. The list is empty when
        the request fails or the status code is not 200.
    """
    url = "https://www.kaakook.fr/rechercher"
    data = {
        "extrfilm": film_name,  # film title
        "extrcitation": ""  # left empty to fetch every quote of the film
    }

    # Send the POST request; a network failure is reported like a bad status
    # so that one unreachable film does not abort the whole run.
    try:
        response = requests.post(url, data=data, timeout=10)
    except requests.RequestException as error:
        print(f"Erreur avec le film {film_name}. {error}")
        return []
    if response.status_code != 200:
        print(f"Erreur avec le film {film_name}. Status code: {response.status_code}")
        return []

    # Parse the HTML answer
    soup = BeautifulSoup(response.content, "html.parser")
    citations = []

    # The year depends only on the film, so look it up once.
    year = get_film_year(film_name)

    for article in soup.find_all('article'):
        link = article.find('a')
        cite = article.find('cite')
        footer = article.find('footer')
        # An article without the expected structure cannot yield a record.
        if link is None or cite is None or footer is None:
            continue

        # Skip multi-line quotes. Searching for the tag itself is required:
        # Beautiful Soup serialises it as "<br/>", so looking for the text
        # "<br>" in str(article) never matches.
        if article.find('br') is not None:
            continue

        # Collapse every run of whitespace (newlines included) to one space.
        citation_text = " ".join(link.get_text().split())

        # Skip empty texts and dialogue lines (starting with a dash)
        if not citation_text or citation_text.startswith('-'):
            continue

        # Keep French quotes only
        if not is_french(citation_text):
            continue

        source = cite.get_text().strip()

        # The character is the second footer link; skip when it is unknown
        footer_links = footer.find_all('a')
        if len(footer_links) < 2:
            continue
        character = footer_links[1].get_text().strip()
        if character == 'Inconnu':
            continue

        # Image served with the quote, if any
        img_tag = article.find('img')
        image_url = img_tag.get('src') if img_tag is not None else None

        # NOTE(review): the original nesting was lost; only the Bing fallback
        # is downloaded here, page images are stored by their src as-is.
        if not image_url:
            image_url = search_image_bing(character, film_name)
            if image_url:
                download_image(image_url, character)
            else:
                # Keep the placeholder rather than writing "None" in the record.
                image_url = 'images/default.jpg'

        # Record layout: "citation; source; character; year; image"
        citations.append(f"\n{citation_text}; {source}; {character}; {year}; {image_url}")

    return citations
|
||||
|
||||
# Download a character image into the local "images" folder
def download_image(image_url, character_name):
    """Save the image at *image_url* as ``images/<character_name>.jpg``.

    Spaces and path separators of *character_name* are replaced by
    underscores to build the file name. Nothing is downloaded when the
    target file already exists. Download and write failures are printed,
    not raised.

    Args:
        image_url: Address of the image to fetch.
        character_name: Name used to build the file name.

    Returns:
        None.
    """
    # A "/" or "\" in the name would point outside "images" or at a
    # sub-folder that does not exist.
    safe_name = character_name.replace(' ', '_').replace('/', '_').replace('\\', '_')
    image_path = os.path.join("images", f"{safe_name}.jpg")

    # Create the "images" folder when missing (no check-then-create race)
    os.makedirs("images", exist_ok=True)

    # Already downloaded: nothing to do
    if os.path.exists(image_path):
        return

    try:
        response = requests.get(image_url, timeout=10)
        # Do not store an HTTP error page under a .jpg name.
        response.raise_for_status()
        with open(image_path, 'wb') as handler:
            handler.write(response.content)
    except (requests.RequestException, OSError, ValueError) as e:
        # Best effort: a missing image must not stop the scraping run.
        print(f"Erreur lors du téléchargement de l'image {image_url} : {e}")
    else:
        print(f"Image téléchargée : {image_path}")
|
||||
|
||||
# Append the collected quotes to the single output text file
def save_citations_to_file(citations):
    """Append every entry of *citations* to ``citation.txt``.

    The file is opened in append mode with UTF-8 encoding and each entry is
    followed by a newline.
    """
    with open("citation.txt", "a", encoding="utf-8") as output:
        output.writelines(entry + "\n" for entry in citations)
|
||||
|
||||
# Go through the film list, scraping and saving the quotes of each film
for film in films:
    print(f"Recherche des citations pour le film : {film}")
    citations = scrape_citations(film)
    if not citations:
        print(f"Aucune citation trouvée pour {film}.")
    else:
        save_citations_to_file(citations)
        print(f"Ajouté {len(citations)} citation(s) pour {film}.")
    # Short pause between films to avoid sending requests too quickly
    time.sleep(2)
|
Loading…
Reference in new issue