Compare commits

..

2 Commits

@ -1,12 +1,12 @@
# Spell Scrapper :scroll: :snake:
## About this repository
This repository was built in order to have an "up to date" spells database for [SpellTastic](https://codefirst.iut.uca.fr/git/Spelltastic/Spelltastic), a cross-platform spell manager for Patfinder.
This repository was built in order to have an "up to date" spells database for [SpellTastic](https://codefirst.iut.uca.fr/git/Spelltastic/Spelltastic), a cross-platform spell manager for Pathfinder.
## Data source
All data is retrieved from [d20pfsrd](https://www.d20pfsrd.com/) the __#1 Pathfinder Roleplaying Game rules reference site__. All spells can be found at [spells](https://www.d20pfsrd.com/magic/all-spells/).
The latest data extracted is available as a YAML file.
The latest data extracted is available as a YAML file and can be found in the `outputs` directory.
## Getting Started
@ -37,10 +37,10 @@ pip install requests beautifulsoup4 lxml pyyaml
```
python scrapping/scrap-spells.py
```
A progress bar should be displayed in your terminal indicating the time left and the number of spells scraped.
__The script should takes about 20 minutes to scrap all spells__
4. This command will generate a file __spells.yaml__ with all spells and their attributes
4. This command will generate a file __spells.yaml__ with all spells and their attributes. The file should be found in the `outputs` directory.
_A progress bar should be displayed in your terminal while scraping, showing the time left and the number of spells scraped. The script should take about 20 minutes to scrape all spells._
### Database
5. You can build a __.db__ sqlite3 database file by running the __spell-db.py__ file:
@ -48,4 +48,4 @@ __The script should takes about 20 minutes to scrap all spells__
python database/spell-db.py
```
6. The script will create a __spells.db__ file with a spell table containing all the spell information.
6. The script will generate a __spells.db__ file with a spell table containing all the spell information. This file should also be found in the `outputs` directory.

@ -41,7 +41,7 @@ def insertSpells():
duration = spell.get('duration')
saving_throw = spell.get('saving_throw')
spell_resistance = spell.get('spell_resistance')
description = spell.get('description')
description = '\n'.join(spell.get('description', []))
components = spell.get('components')
area = spell.get('area')
effect = spell.get('effect')

Binary file not shown.

File diff suppressed because it is too large Load Diff

@ -9,80 +9,62 @@ import bs4
from bs4 import BeautifulSoup
from lxml import html
# URL = "https://www.d20pfsrd.com/magic/all-spells/m/magic-circle-against-evil/"
# URL = "https://www.d20pfsrd.com/magic/all-spells/p/prophetic-lore/"
URL = "https://www.d20pfsrd.com/magic/all-spells/d/death-from-below/"
URL = "https://www.d20pfsrd.com/magic/all-spells/a/ant-haul/"
responseDetails = requests.get(URL)
spellSoup = BeautifulSoup(responseDetails.content, 'lxml')
spellContent = spellSoup.find(id='article-content')
description = spellContent.find('p', string='DESCRIPTION')
if description is None:
description = spellContent.find_next_sibling()
next_elem = description.find_next('p')
html= ''
while next_elem and not (next_elem.name == 'div' and 'section15' in next_elem.get('class', [])):
print("ne;", next_elem )
print()
html += str(next_elem)
next_elem = next_elem.find_next_sibling()
# Convert HTML string to regular string
htmlString = str(html)
if "<h4" in htmlString:
# Trim HTML string
htmlString = htmlString.split("<h4")[0]
if "</p><div>" in htmlString:
last_p_index = htmlString.rfind("</p>")
if last_p_index != -1:
htmlString = htmlString[:last_p_index + 4]
# Convert back to BeautifulSoup object
trimmed_html = BeautifulSoup(htmlString, 'html.parser')
str(trimmed_html)
trimmed_html.prettify()
print(trimmed_html)
###################
### LEVELS
###################
# school_levels = spellContent.find('b',string="School").find_previous('p')
# text = school_levels.text
# parts = text.split("Level")
# spell_school = parts[0].replace("School","").strip().strip(";")
# spell_level = parts[1].replace("Level","").strip().split(";")[0]
school_levels = spellContent.find('b',string="School").find_previous('p')
text = school_levels.text
parts = text.split("Level")
spell_school = parts[0].replace("School","").strip().strip(";")
spell_level = parts[1].replace("Level","").strip().split(";")[0]
# print("level: ", spell_level)
# print("school: ", spell_school)
print("level: ", spell_level)
print("school: ", spell_school)
###################
### DESCRIPTION
###################
# spell_description = spellContent.find('p',string='DESCRIPTION')
# print("Desc separator: ", spell_description)
# spell_paragraphs = []
# spell_description = spellContent.find('p',string='DESCRIPTION')
# if not spell_description:
# spell_description = spellSoup.find('div', {'class': 'page-center'}).find('p',string='DESCRIPTION')
# if not spell_description:
# spell_description = None
# exit # change to continue
# spell_description = spell_description.find_next()
# # check if spell description is a table
# if spell_description.name == 'table':
# spell_paragraphs.append(spell_description)
# spell_description = spell_description.find_next()
# print("paragraphs: ", spell_paragraphs)
# print("description: ", spell_description)
# Find the parent tag of the <p> tag with text "DESCRIPTION"
# spell_description = spell_description.find_next('p')
# while spell_description and not spell_description.find_previous('div', {'class': 'section15'}):
# if spell_description.has_attr('class'):
# spell_paragraphs.append(spell_description.text)
# else:
# spell_paragraphs.append(spell_description.text)
# spell_description = spell_description.find_next('p')
# if spell_description and spell_description.parent.name == 'div':
# break
# prettify
# print("Spell description:\n", '\n\n'.join(spell_paragraphs))
# def getStringSiblings(array, content, stop):
# if content:
# for sibling in content.next_siblings:
# print(sibling)
# if sibling.name == stop:
# break
# if sibling.name == 'a':
# array.append(sibling.text)
# elif isinstance((sibling), bs4.element.NavigableString):
# component_text = sibling.string.strip()
# if component_text:
# array.append(component_text.rstrip(';'))
# else:
# return None
# return ' '.join(array)
###################
### TARGET

@ -55,7 +55,7 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
# get article content which contains all info about the spell
spellContent = spellSoup.find(id='article-content')
# pbar.update(1)
pbar.update(1)
###################
### ATTRIBUTES
@ -67,7 +67,7 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
else :
spell_name = None
continue
print("name: ",spell_name)
# print("name: ",spell_name)
# get school and level
school_levels = spellContent.find('b',string="School").find_previous('p')
@ -135,32 +135,23 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
# print('Effect: ',spell_effect)
# get description
description = spellContent.find('p', string='DESCRIPTION')
if description is None:
description = spellContent.find_next_sibling()
else:
next_elem = description.find_next_sibling()
html = ''
while next_elem and not (next_elem.name == 'div' and 'section15' in next_elem.get('class', [])):
html += str(next_elem)
next_elem = next_elem.find_next_sibling()
# Convert HTML string to regular string
htmlString = str(html)
if "<h4" in htmlString:
# Trim HTML string
htmlString = htmlString.split("<h4")[0]
if "</p><div>" in htmlString:
last_p_index = htmlString.rfind("</p>")
if last_p_index != -1:
htmlString = htmlString[:last_p_index + 4]
spell_paragraphs = []
spell_description = spellContent.find('p',string='DESCRIPTION')
if not spell_description:
spell_description = spellSoup.find('div', {'class': 'page-center'}).find('p',string='DESCRIPTION')
if not spell_description:
spell_description = None
continue
spell_description = spell_description.find_next('p')
# Convert back to BeautifulSoup object
spell_paragraphs = htmlString
while spell_description and not spell_description.find_previous('div', {'class': 'section15'}):
if spell_description.has_attr('class'):
spell_paragraphs.append(spell_description.text)
else:
spell_paragraphs.append(spell_description.text)
spell_description = spell_description.find_next('p')
if spell_description and spell_description.parent.name == 'div':
break
# add all attributes to a spell dictionary
spellz[spell_name] = {

Loading…
Cancel
Save