Compare commits

..

3 Commits

@ -1,12 +1,12 @@
# Spell Scrapper :scroll: :snake:
## About this repository
This repository was built in order to have an "up to date" spells database for [SpellTastic](https://codefirst.iut.uca.fr/git/Spelltastic/Spelltastic), a cross-platform spell manager for Pathfinder.
This repository was built in order to have an "up to date" spells database for [SpellTastic](https://codefirst.iut.uca.fr/git/Spelltastic/Spelltastic), a cross-platform spell manager for Pathfinder.
## Data source
All data is retrieved from [d20pfsrd](https://www.d20pfsrd.com/) the __#1 Pathfinder Roleplaying Game rules reference site__. All spells can be found at [spells](https://www.d20pfsrd.com/magic/all-spells/).
The latest data extracted is available as a YAML file and can be found in the `outputs` directory.
The latest data extracted is available as a YAML file.
## Getting Started
@ -37,10 +37,10 @@ pip install requests beautifulsoup4 lxml pyyaml
```
python scrapping/scrap-spells.py
```
A progress bar should be displayed in your terminal indicating the time left and the number of spells scraped.
__The script should take about 20 minutes to scrape all spells__
4. This command will generate a file __spells.yaml__ with all spells and their attributes. The file should be found in the `outputs` directory.
_A progress bar should be displayed in your terminal while scraping, showing the time left and the number of spells scraped. The script should take about 20 minutes to scrape all spells._
4. This command will generate a file __spells.yaml__ with all spells and their attributes
### Database
5. You can build a __.db__ sqlite3 database file by running the __spell_db.py__ file:
@ -48,4 +48,4 @@ _A progress bar should be displayed in your terminal while scrapping, showing th
python database/spell-db.py
```
6. The script will generate a __spells.db__ file with a spell table containing all the spell information. This file should also be found in the `outputs` directory.
6. The script will create a __spells.db__ file with a spell table containing all the spell information.

@ -41,7 +41,7 @@ def insertSpells():
duration = spell.get('duration')
saving_throw = spell.get('saving_throw')
spell_resistance = spell.get('spell_resistance')
description = '\n'.join(spell.get('description', []))
description = spell.get('description')
components = spell.get('components')
area = spell.get('area')
effect = spell.get('effect')

Binary file not shown.

File diff suppressed because it is too large Load Diff

@ -9,62 +9,80 @@ import bs4
from bs4 import BeautifulSoup
from lxml import html
URL = "https://www.d20pfsrd.com/magic/all-spells/a/ant-haul/"
# URL = "https://www.d20pfsrd.com/magic/all-spells/m/magic-circle-against-evil/"
# URL = "https://www.d20pfsrd.com/magic/all-spells/p/prophetic-lore/"
URL = "https://www.d20pfsrd.com/magic/all-spells/d/death-from-below/"
responseDetails = requests.get(URL)
spellSoup = BeautifulSoup(responseDetails.content, 'lxml')
spellContent = spellSoup.find(id='article-content')
description = spellContent.find('p', string='DESCRIPTION')
if description is None:
description = spellContent.find_next_sibling()
next_elem = description.find_next('p')
html= ''
while next_elem and not (next_elem.name == 'div' and 'section15' in next_elem.get('class', [])):
print("ne;", next_elem )
print()
html += str(next_elem)
next_elem = next_elem.find_next_sibling()
# Convert HTML string to regular string
htmlString = str(html)
if "<h4" in htmlString:
# Trim HTML string
htmlString = htmlString.split("<h4")[0]
if "</p><div>" in htmlString:
last_p_index = htmlString.rfind("</p>")
if last_p_index != -1:
htmlString = htmlString[:last_p_index + 4]
# Convert back to BeautifulSoup object
trimmed_html = BeautifulSoup(htmlString, 'html.parser')
str(trimmed_html)
trimmed_html.prettify()
print(trimmed_html)
###################
### LEVELS
###################
school_levels = spellContent.find('b',string="School").find_previous('p')
text = school_levels.text
parts = text.split("Level")
spell_school = parts[0].replace("School","").strip().strip(";")
spell_level = parts[1].replace("Level","").strip().split(";")[0]
# school_levels = spellContent.find('b',string="School").find_previous('p')
# text = school_levels.text
# parts = text.split("Level")
# spell_school = parts[0].replace("School","").strip().strip(";")
# spell_level = parts[1].replace("Level","").strip().split(";")[0]
print("level: ", spell_level)
print("school: ", spell_school)
# print("level: ", spell_level)
# print("school: ", spell_school)
###################
### DESCRIPTION
###################
# spell_description = spellContent.find('p',string='DESCRIPTION')
# print("Desc separator: ", spell_description)
# spell_paragraphs = []
# spell_description = spellContent.find('p',string='DESCRIPTION')
# if not spell_description:
# spell_description = spellSoup.find('div', {'class': 'page-center'}).find('p',string='DESCRIPTION')
# spell_description = spell_description.find_next('p')
# if not spell_description:
# spell_description = None
# exit # change to continue
# spell_description = spell_description.find_next()
# # check if spell description is a table
# if spell_description.name == 'table':
# spell_paragraphs.append(spell_description)
# spell_description = spell_description.find_next()
# while spell_description and not spell_description.find_previous('div', {'class': 'section15'}):
# if spell_description.has_attr('class'):
# spell_paragraphs.append(spell_description.text)
# else:
# spell_paragraphs.append(spell_description.text)
# spell_description = spell_description.find_next('p')
# if spell_description and spell_description.parent.name == 'div':
# break
# print("paragraphs: ", spell_paragraphs)
# print("description: ", spell_description)
# Find the parent tag of the <p> tag with text "DESCRIPTION"
# print("Spell description:\n", '\n\n'.join(spell_paragraphs))
# def getStringSiblings(array, content, stop):
# if content:
# for sibling in content.next_siblings:
# print(sibling)
# if sibling.name == stop:
# break
# if sibling.name == 'a':
# array.append(sibling.text)
# elif isinstance((sibling), bs4.element.NavigableString):
# component_text = sibling.string.strip()
# if component_text:
# array.append(component_text.rstrip(';'))
# else:
# return None
# return ' '.join(array)
# prettify
###################
### TARGET

@ -55,7 +55,7 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
# get article content which contains all info about the spell
spellContent = spellSoup.find(id='article-content')
pbar.update(1)
# pbar.update(1)
###################
### ATTRIBUTES
@ -67,7 +67,7 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
else :
spell_name = None
continue
# print("name: ",spell_name)
print("name: ",spell_name)
# get school and level
school_levels = spellContent.find('b',string="School").find_previous('p')
@ -135,23 +135,32 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
# print('Effect: ',spell_effect)
# get description
spell_paragraphs = []
spell_description = spellContent.find('p',string='DESCRIPTION')
if not spell_description:
spell_description = spellSoup.find('div', {'class': 'page-center'}).find('p',string='DESCRIPTION')
if not spell_description:
spell_description = None
continue
spell_description = spell_description.find_next('p')
while spell_description and not spell_description.find_previous('div', {'class': 'section15'}):
if spell_description.has_attr('class'):
spell_paragraphs.append(spell_description.text)
description = spellContent.find('p', string='DESCRIPTION')
if description is None:
description = spellContent.find_next_sibling()
else:
spell_paragraphs.append(spell_description.text)
spell_description = spell_description.find_next('p')
if spell_description and spell_description.parent.name == 'div':
break
next_elem = description.find_next_sibling()
html = ''
while next_elem and not (next_elem.name == 'div' and 'section15' in next_elem.get('class', [])):
html += str(next_elem)
next_elem = next_elem.find_next_sibling()
# Convert HTML string to regular string
htmlString = str(html)
if "<h4" in htmlString:
# Trim HTML string
htmlString = htmlString.split("<h4")[0]
if "</p><div>" in htmlString:
last_p_index = htmlString.rfind("</p>")
if last_p_index != -1:
htmlString = htmlString[:last_p_index + 4]
# Keep the trimmed HTML as a plain string (it is not re-parsed into a BeautifulSoup object here)
spell_paragraphs = htmlString
# add all attributes to a spell dictionary
spellz[spell_name] = {

Loading…
Cancel
Save