added progress bar and minor README changes

main
Nicolas FRANCO 2 years ago
parent d43d67c8b7
commit 2b6f7a8783

@ -35,16 +35,17 @@ pip install requests beautifulsoup4 lxml pyyaml
### Scraping ### Scraping
3. You can run __scrap-spells.py__ to scrape the spell information from the website: 3. You can run __scrap-spells.py__ to scrape the spell information from the website:
``` ```
python3 scrap-spells.py python scrapping/scrap-spells.py
``` ```
__The script should take a few minutes to scrap all spells__ A progress bar should be displayed in your terminal indicating the time left and the number of spells scraped.
__The script should take about 20 minutes to scrape all spells__
4. This command will generate a file __spells.yaml__ with all spells and their attributes 4. This command will generate a file __spells.yaml__ with all spells and their attributes
### Database ### Database
5. You can build a __.db__ sqlite3 database file by running the __spell-db.py__ file: 5. You can build a __.db__ sqlite3 database file by running the __spell-db.py__ file:
``` ```
python3 spell-db.py python database/spell-db.py
``` ```
6. The script will create a __spells.db__ file with a spell table containing all the spell information. 6. The script will create a __spells.db__ file with a spell table containing all the spell information.

@ -4,6 +4,7 @@ import bs4
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from lxml import html from lxml import html
import yaml import yaml
from tqdm import tqdm
### ------------------------------------- ### -------------------------------------
# GET ALL THE SPELLS FROM THE PAGE # GET ALL THE SPELLS FROM THE PAGE
@ -42,9 +43,8 @@ def getStringSiblings(array, content, stop):
return None return None
return ' '.join(array) return ' '.join(array)
cpt = 0
spellz = {} spellz = {}
pbar = tqdm(total=2650, desc="[Processing]", unit=" spell")
for li in lis: for li in lis:
url = li.a['href'] url = li.a['href']
@ -55,6 +55,7 @@ for li in lis:
# get article content which contains all info about spells # get article content which contains all info about spells
spellContent = spellSoup.find(id='article-content') spellContent = spellSoup.find(id='article-content')
pbar.update(1)
################### ###################
### ATTRIBUTES ### ATTRIBUTES
@ -66,7 +67,7 @@ for li in lis:
else : else :
spell_name = None spell_name = None
continue continue
print("name: ",spell_name) # print("name: ",spell_name)
# get school and level # get school and level
school_levels = spellContent.find('b',string="School").find_previous('p') school_levels = spellContent.find('b',string="School").find_previous('p')
@ -75,63 +76,63 @@ for li in lis:
spell_school = parts[0].replace("School","").strip().strip(";") spell_school = parts[0].replace("School","").strip().strip(";")
spell_level = parts[1].replace("Level","").strip() spell_level = parts[1].replace("Level","").strip()
print("School: ",spell_school) # print("School: ",spell_school)
print("Level:",spell_level) # print("Level:",spell_level)
# get casting time # get casting time
castTime = [] castTime = []
spell_castTime = spellContent.find('b',string="Casting Time") spell_castTime = spellContent.find('b',string="Casting Time")
spell_castTime = getStringSiblings(castTime, spell_castTime, 'b') spell_castTime = getStringSiblings(castTime, spell_castTime, 'b')
print("Cast time: ", spell_castTime) # print("Cast time: ", spell_castTime)
# get components # get components
components = [] components = []
spell_components = spellContent.find('b', string='Components')#.next_sibling.strip() spell_components = spellContent.find('b', string='Components')#.next_sibling.strip()
spell_components = getStringSiblings(components, spell_components, 'p') spell_components = getStringSiblings(components, spell_components, 'p')
print ("Components: ", spell_components) # print ("Components: ", spell_components)
# get range # get range
rangesp = [] rangesp = []
spell_range = spellContent.find('b',string="Range") spell_range = spellContent.find('b',string="Range")
spell_range = getStringSiblings(rangesp, spell_range, 'b') spell_range = getStringSiblings(rangesp, spell_range, 'b')
print("Range: ", spell_range) # print("Range: ", spell_range)
# get target # get target
target = [] target = []
spell_target = spellContent.find('b',string="Target") spell_target = spellContent.find('b',string="Target")
spell_target = getStringSiblings(target, spell_target, 'b') spell_target = getStringSiblings(target, spell_target, 'b')
print("Target: ", spell_target) # print("Target: ", spell_target)
#get duration #get duration
duration = [] duration = []
spell_duration = spellContent.find('b',string="Duration") spell_duration = spellContent.find('b',string="Duration")
spell_duration = getStringSiblings(duration, spell_duration, 'b') spell_duration = getStringSiblings(duration, spell_duration, 'b')
print("Duration: ",spell_duration) # print("Duration: ",spell_duration)
# get saving throw # get saving throw
svthrow = [] svthrow = []
spell_saving_throw = spellContent.find('b',string='Saving Throw') spell_saving_throw = spellContent.find('b',string='Saving Throw')
spell_saving_throw = getStringSiblings(svthrow, spell_saving_throw, 'b') spell_saving_throw = getStringSiblings(svthrow, spell_saving_throw, 'b')
print("Saving throw: ", spell_saving_throw) # print("Saving throw: ", spell_saving_throw)
# get resistance # get resistance
resistance = [] resistance = []
spell_resistance = spellContent.find('b',string='Spell Resistance') spell_resistance = spellContent.find('b',string='Spell Resistance')
spell_resistance = getStringSiblings(resistance, spell_resistance, 'b') spell_resistance = getStringSiblings(resistance, spell_resistance, 'b')
print("Spell Resistance: ", spell_resistance) # print("Spell Resistance: ", spell_resistance)
# get area # get area
area = [] area = []
spell_area = spellContent.find('b',string='Area') spell_area = spellContent.find('b',string='Area')
spell_area = getStringSiblings(area, spell_area, 'b') spell_area = getStringSiblings(area, spell_area, 'b')
print("Area:", spell_area ) # print("Area:", spell_area )
# get effect # get effect
effect = [] effect = []
spell_effect = spellContent.find('b',string='Effect') spell_effect = spellContent.find('b',string='Effect')
spell_effect = getStringSiblings(effect, spell_effect, 'b') spell_effect = getStringSiblings(effect, spell_effect, 'b')
print('Effect: ',spell_effect) # print('Effect: ',spell_effect)
# get description # get description
spell_paragraphs = [] spell_paragraphs = []
@ -153,7 +154,7 @@ for li in lis:
break break
print("Spell description:\n", '\n\n'.join(spell_paragraphs)) # print("Spell description:\n", '\n\n'.join(spell_paragraphs))
# print(" ----- ") # print(" ----- ")
# print(" ") # print(" ")
@ -177,6 +178,7 @@ for li in lis:
with open('outputs/spells.yaml', 'w') as f: with open('outputs/spells.yaml', 'w') as f:
yaml.dump(spellz, f) yaml.dump(spellz, f)
pbar.close()

Loading…
Cancel
Save