|
|
@ -11,39 +11,23 @@ from tqdm import tqdm
|
|
|
|
### -------------------------------------
|
|
|
|
### -------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
## GET <li> ELEMENTS NAME + URL TO DETAIL PAGE
|
|
|
|
## GET <li> ELEMENTS NAME + URL TO DETAIL PAGE
|
|
|
|
# url with all spells
|
|
|
|
# set url for page with all spells
|
|
|
|
URL = "https://www.d20pfsrd.com/magic/all-spells/"
|
|
|
|
URL = "https://www.d20pfsrd.com/magic/all-spells/"
|
|
|
|
|
|
|
|
|
|
|
|
# get the page content using GET to url
|
|
|
|
# get the page content
|
|
|
|
response = requests.get(URL)
|
|
|
|
response = requests.get(URL)
|
|
|
|
|
|
|
|
|
|
|
|
# parse html using BeautifulSoup
|
|
|
|
# parse html using beautifulSoup
|
|
|
|
soup = BeautifulSoup(response.content, 'lxml')
|
|
|
|
soup = BeautifulSoup(response.content, 'lxml')
|
|
|
|
list = soup.find(id='article-content').find_next('div',class_="flexbox")
|
|
|
|
list = soup.find(id='article-content').find_next('div',class_="flexbox")
|
|
|
|
|
|
|
|
|
|
|
|
# this gets all the <li> elements from the article-content div, which contain all of the
|
|
|
|
# this gets all the <li> elements from the article-content div, which contains all of the
|
|
|
|
# spells (name and link to detail page)
|
|
|
|
# spells (name and link to detail page)
|
|
|
|
lis = list.find_all('li')
|
|
|
|
lis = list.find_all('li')
|
|
|
|
|
|
|
|
|
|
|
|
###################
|
|
|
|
###################
|
|
|
|
### METHODS
|
|
|
|
### METHODS
|
|
|
|
###################
|
|
|
|
###################
|
|
|
|
def parseLevelAndGetClass(spell_level):
    """Map each class named in a spell's "Level" text to its spell level.

    ``spell_level`` is a comma-separated string such as
    ``"bard 2, sorcerer/wizard 3"``. Each entry is a class name followed by
    a level; several classes sharing one level are joined with ``/``.

    Returns a dict of class name -> level, with levels kept as strings.
    An entry that contains no space (i.e. no level part) is recorded with
    level ``"1"``. Blank entries (empty input, stray commas) and blank class
    names (stray ``/``) are skipped instead of producing an empty-string key.
    """
    class_dict = {}
    for class_level in spell_level.split(","):
        class_level = class_level.strip()
        if not class_level:
            # "".split(",") yields [""], and so do trailing/double commas;
            # without this guard they would be stored as {"": "1"}.
            continue
        if " " not in class_level:
            # No level given for this entry: fall back to level "1".
            class_dict[class_level] = "1"
            continue
        # Split on the LAST whitespace run so multi-word class names
        # ("unchained summoner 3") keep their full name.
        class_name, level = class_level.rsplit(maxsplit=1)
        # A name without "/" splits into a single-element list, so one loop
        # covers both the single-class and the shared-level case.
        for name in class_name.split("/"):
            name = name.strip()
            if name:
                class_dict[name] = level
    return class_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def getStringSiblings(array, content, stop):
|
|
|
|
def getStringSiblings(array, content, stop):
|
|
|
|
if content:
|
|
|
|
if content:
|
|
|
|
for sibling in content.next_siblings:
|
|
|
|
for sibling in content.next_siblings:
|
|
|
@ -62,14 +46,14 @@ def getStringSiblings(array, content, stop):
|
|
|
|
spellz = {}
|
|
|
|
spellz = {}
|
|
|
|
pbar = tqdm(total=2650, desc="[Processing]", unit=" spell")
|
|
|
|
pbar = tqdm(total=2650, desc="[Processing]", unit=" spell")
|
|
|
|
|
|
|
|
|
|
|
|
for li in lis:
|
|
|
|
for li in lis: # now we loop over all spells, get the page link and scrap all attributes
|
|
|
|
url = li.a['href']
|
|
|
|
url = li.a['href']
|
|
|
|
|
|
|
|
|
|
|
|
## get html of details page
|
|
|
|
## get html of details page
|
|
|
|
responseDetails = requests.get(url)
|
|
|
|
responseDetails = requests.get(url)
|
|
|
|
spellSoup = BeautifulSoup(responseDetails.content, 'lxml')
|
|
|
|
spellSoup = BeautifulSoup(responseDetails.content, 'lxml')
|
|
|
|
|
|
|
|
|
|
|
|
# get article content which contains all info about spells
|
|
|
|
# get article content which contains all info about the spell
|
|
|
|
spellContent = spellSoup.find(id='article-content')
|
|
|
|
spellContent = spellSoup.find(id='article-content')
|
|
|
|
pbar.update(1)
|
|
|
|
pbar.update(1)
|
|
|
|
|
|
|
|
|
|
|
@ -92,8 +76,6 @@ for li in lis:
|
|
|
|
spell_school = parts[0].replace("School","").strip().strip(";")
|
|
|
|
spell_school = parts[0].replace("School","").strip().strip(";")
|
|
|
|
spell_level = parts[1].replace("Level","").strip().split(";")[0]
|
|
|
|
spell_level = parts[1].replace("Level","").strip().split(";")[0]
|
|
|
|
|
|
|
|
|
|
|
|
spell_class_and_level = parseLevelAndGetClass(spell_level)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# print("School: ",spell_school)
|
|
|
|
# print("School: ",spell_school)
|
|
|
|
# print("Level:",spell_level)
|
|
|
|
# print("Level:",spell_level)
|
|
|
|
|
|
|
|
|
|
|
@ -171,16 +153,10 @@ for li in lis:
|
|
|
|
if spell_description and spell_description.parent.name == 'div':
|
|
|
|
if spell_description and spell_description.parent.name == 'div':
|
|
|
|
break
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# add all attributes to a spell dictionary
|
|
|
|
# print("Spell description:\n", '\n\n'.join(spell_paragraphs))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# print(" ----- ")
|
|
|
|
|
|
|
|
# print(" ")
|
|
|
|
|
|
|
|
# cpt += 1
|
|
|
|
|
|
|
|
# print("no: ",cpt)
|
|
|
|
|
|
|
|
spellz[spell_name] = {
|
|
|
|
spellz[spell_name] = {
|
|
|
|
'school': spell_school,
|
|
|
|
'school': spell_school,
|
|
|
|
'level': spell_class_and_level,
|
|
|
|
'level': spell_level,
|
|
|
|
'casting_time': spell_castTime,
|
|
|
|
'casting_time': spell_castTime,
|
|
|
|
'components': spell_components,
|
|
|
|
'components': spell_components,
|
|
|
|
'range': spell_range,
|
|
|
|
'range': spell_range,
|
|
|
@ -196,7 +172,7 @@ for li in lis:
|
|
|
|
with open('outputs/spells.yaml', 'w') as f:
|
|
|
|
with open('outputs/spells.yaml', 'w') as f:
|
|
|
|
yaml.dump(spellz, f)
|
|
|
|
yaml.dump(spellz, f)
|
|
|
|
|
|
|
|
|
|
|
|
pbar.close()
|
|
|
|
pbar.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|