rework description alg ♻️

description
Nicolas FRANCO 2 years ago
parent 003bd32c8f
commit 3bde5a9363

File diff suppressed because it is too large Load Diff

@ -9,62 +9,76 @@ import bs4
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from lxml import html from lxml import html
URL = "https://www.d20pfsrd.com/magic/all-spells/a/ant-haul/" # URL = "https://www.d20pfsrd.com/magic/all-spells/m/magic-circle-against-evil/"
# URL = "https://www.d20pfsrd.com/magic/all-spells/p/prophetic-lore/"
URL = "https://www.d20pfsrd.com/magic/all-spells/t/time-stop/"
responseDetails = requests.get(URL) responseDetails = requests.get(URL)
spellSoup = BeautifulSoup(responseDetails.content, 'lxml') spellSoup = BeautifulSoup(responseDetails.content, 'lxml')
spellContent = spellSoup.find(id='article-content') spellContent = spellSoup.find(id='article-content')
description = spellContent.find('p', string='DESCRIPTION')
next_elem = description.find_next_sibling()
html = ''
while next_elem and not (next_elem.name == 'div' and 'section15' in next_elem.get('class', [])):
html += str(next_elem)
next_elem = next_elem.find_next_sibling()
# Convert HTML string to regular string
htmlString = str(html)
if "<h4" in htmlString:
# Trim HTML string
htmlString = htmlString.split("<h4")[0]
if "</p><div>" in htmlString:
last_p_index = htmlString.rfind("</p>")
if last_p_index != -1:
htmlString = htmlString[:last_p_index + 4]
# Convert back to BeautifulSoup object
trimmed_html = BeautifulSoup(htmlString, 'html.parser')
str(trimmed_html)
trimmed_html.prettify()
print(trimmed_html)
################### ###################
### LEVELS ### LEVELS
################### ###################
school_levels = spellContent.find('b',string="School").find_previous('p') # school_levels = spellContent.find('b',string="School").find_previous('p')
text = school_levels.text # text = school_levels.text
parts = text.split("Level") # parts = text.split("Level")
spell_school = parts[0].replace("School","").strip().strip(";") # spell_school = parts[0].replace("School","").strip().strip(";")
spell_level = parts[1].replace("Level","").strip().split(";")[0] # spell_level = parts[1].replace("Level","").strip().split(";")[0]
print("level: ", spell_level) # print("level: ", spell_level)
print("school: ", spell_school) # print("school: ", spell_school)
################### ###################
### DESCRIPTION ### DESCRIPTION
################### ###################
# spell_description = spellContent.find('p',string='DESCRIPTION')
# print("Desc separator: ", spell_description)
# spell_paragraphs = [] # spell_paragraphs = []
# spell_description = spellContent.find('p',string='DESCRIPTION') # spell_description = spellContent.find('p',string='DESCRIPTION')
# if not spell_description: # if not spell_description:
# spell_description = spellSoup.find('div', {'class': 'page-center'}).find('p',string='DESCRIPTION') # spell_description = spellSoup.find('div', {'class': 'page-center'}).find('p',string='DESCRIPTION')
# spell_description = spell_description.find_next('p') # if not spell_description:
# spell_description = None
# exit # change to continue
# spell_description = spell_description.find_next()
# # check if spell description is a table
# if spell_description.name == 'table':
# spell_paragraphs.append(spell_description)
# spell_description = spell_description.find_next()
# while spell_description and not spell_description.find_previous('div', {'class': 'section15'}): # print("paragraphs: ", spell_paragraphs)
# if spell_description.has_attr('class'): # print("description: ", spell_description)
# spell_paragraphs.append(spell_description.text) # Find the parent tag of the <p> tag with text "DESCRIPTION"
# else:
# spell_paragraphs.append(spell_description.text)
# spell_description = spell_description.find_next('p')
# if spell_description and spell_description.parent.name == 'div':
# break
# print("Spell description:\n", '\n\n'.join(spell_paragraphs))
# def getStringSiblings(array, content, stop): # prettify
# if content:
# for sibling in content.next_siblings:
# print(sibling)
# if sibling.name == stop:
# break
# if sibling.name == 'a':
# array.append(sibling.text)
# elif isinstance((sibling), bs4.element.NavigableString):
# component_text = sibling.string.strip()
# if component_text:
# array.append(component_text.rstrip(';'))
# else:
# return None
# return ' '.join(array)
################### ###################
### TARGET ### TARGET

@ -44,7 +44,8 @@ def getStringSiblings(array, content, stop):
return ' '.join(array) return ' '.join(array)
spellz = {} spellz = {}
pbar = tqdm(total=2650, desc="[Processing]", unit=" spell") # pbar = tqdm(total=2650, desc="[Processing]", unit=" spell")
cpt = 0
for li in lis: # now we loop over all spells, get the page link and scrap all attributes for li in lis: # now we loop over all spells, get the page link and scrap all attributes
url = li.a['href'] url = li.a['href']
@ -55,7 +56,7 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
# get article content which contains all info about the spell # get article content which contains all info about the spell
spellContent = spellSoup.find(id='article-content') spellContent = spellSoup.find(id='article-content')
pbar.update(1) # pbar.update(1)
################### ###################
### ATTRIBUTES ### ATTRIBUTES
@ -67,7 +68,7 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
else : else :
spell_name = None spell_name = None
continue continue
# print("name: ",spell_name) print("name: ",spell_name)
# get school and level # get school and level
school_levels = spellContent.find('b',string="School").find_previous('p') school_levels = spellContent.find('b',string="School").find_previous('p')
@ -135,23 +136,29 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
# print('Effect: ',spell_effect) # print('Effect: ',spell_effect)
# get description # get description
spell_paragraphs = [] description = spellContent.find('p', string='DESCRIPTION')
spell_description = spellContent.find('p',string='DESCRIPTION') if(description):
if not spell_description: next_elem = description.find_next_sibling()
spell_description = spellSoup.find('div', {'class': 'page-center'}).find('p',string='DESCRIPTION') html = ''
if not spell_description: while next_elem and not (next_elem.name == 'div' and 'section15' in next_elem.get('class', [])):
spell_description = None html += str(next_elem)
continue next_elem = next_elem.find_next_sibling()
spell_description = spell_description.find_next('p')
while spell_description and not spell_description.find_previous('div', {'class': 'section15'}): # Convert HTML string to regular string
if spell_description.has_attr('class'): htmlString = str(html)
spell_paragraphs.append(spell_description.text)
else: if "<h4" in htmlString:
spell_paragraphs.append(spell_description.text) # Trim HTML string
spell_description = spell_description.find_next('p') htmlString = htmlString.split("<h4")[0]
if spell_description and spell_description.parent.name == 'div':
break if "</p><div>" in htmlString:
last_p_index = htmlString.rfind("</p>")
if last_p_index != -1:
htmlString = htmlString[:last_p_index + 4]
# Convert back to BeautifulSoup object
spell_paragraphs = htmlString
# add all attributes to a spell dictionnary # add all attributes to a spell dictionnary
spellz[spell_name] = { spellz[spell_name] = {
@ -172,7 +179,7 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
with open('outputs/spells.yaml', 'w') as f: with open('outputs/spells.yaml', 'w') as f:
yaml.dump(spellz, f) yaml.dump(spellz, f)
pbar.close() # pbar.close()

Loading…
Cancel
Save