added scripts and latest outputs

main
Nicolas FRANCO 2 years ago
parent 7f6bb4a6f5
commit d43d67c8b7

@ -1,5 +1,8 @@
# Spell Scrapper :scroll: :snake:
## About this repository
This repository was built in order to have an "up to date" spells database for [SpellTastic](https://codefirst.iut.uca.fr/git/Spelltastic/Spelltastic), a cross-platform spell manager for Pathfinder.
## Data source
All data is retrieved from [d20pfsrd](https://www.d20pfsrd.com/) the __#1 Pathfinder Roleplaying Game rules reference site__. All spells can be found at [spells](https://www.d20pfsrd.com/magic/all-spells/).

@ -0,0 +1,57 @@
import yaml
import sqlite3
def createDatabase(db_path='database/spells.db'):
    """(Re)create an empty ``spell`` table in the SQLite file at *db_path*.

    Any existing ``spell`` table is dropped first, so previously stored rows
    are lost. The parent directory of *db_path* is created when missing,
    because ``sqlite3.connect`` cannot create intermediate directories.

    Args:
        db_path: Location of the SQLite database file. Defaults to the path
            that was previously hard-coded.
    """
    # NOTE(review): insertSpells() in this file opens 'assets/spells.db' while
    # this function defaults to 'database/spells.db' -- confirm which location
    # is intended, otherwise the inserts go to a database without this table.
    import os

    parent_dir = os.path.dirname(db_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    connexion = sqlite3.connect(db_path)
    try:
        cursor = connexion.cursor()
        cursor.execute('''DROP TABLE IF EXISTS spell''')
        cursor.execute('''CREATE TABLE spell(
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT,
            level TEXT,
            school TEXT,
            casting_time TEXT,
            components TEXT,
            range TEXT,
            target TEXT,
            area TEXT,
            effect TEXT,
            duration TEXT,
            saving_throw TEXT,
            spell_resistance TEXT,
            description TEXT)''')
        connexion.commit()
    finally:
        # Always release the file handle, even if a statement above failed.
        connexion.close()
def insertSpells(yaml_path='outputs/spells.yaml', db_path='assets/spells.db'):
    """Load the scraped spells from YAML and insert them into the ``spell`` table.

    The YAML document is expected to map each spell name to a mapping of
    attributes; ``description`` is a list of paragraphs that is stored as a
    single newline-joined string. Attributes that are absent are stored as
    NULL.

    Args:
        yaml_path: YAML file to read. Defaults to the previously hard-coded
            path.
        db_path: SQLite database file to write to. Defaults to the previously
            hard-coded path.
    """
    # NOTE(review): createDatabase() in this file creates the table in
    # 'database/spells.db' while this function defaults to 'assets/spells.db'
    # -- confirm which location is intended.
    with open(yaml_path, 'r', encoding='utf-8') as file:
        # An empty YAML file loads as None; treat it as "no spells".
        spells = yaml.safe_load(file) or {}

    # Attribute keys in the same order as the INSERT column list below
    # (between the leading name and the trailing description).
    attribute_keys = (
        'level', 'school', 'casting_time', 'components', 'range', 'target',
        'area', 'effect', 'duration', 'saving_throw', 'spell_resistance',
    )
    rows = []
    for name, spell in spells.items():
        spell = spell or {}
        # `or []` also covers an explicit `description: null` entry.
        description = '\n'.join(spell.get('description') or [])
        rows.append(
            (name, *(spell.get(key) for key in attribute_keys), description)
        )

    connexion = sqlite3.connect(db_path)
    try:
        cursor = connexion.cursor()
        cursor.executemany(
            '''INSERT INTO spell(name, level, school, casting_time, components, range, target, area, effect, duration, saving_throw, spell_resistance, description)
            VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
            rows,
        )
        connexion.commit()
    finally:
        # Always release the file handle, even if the insert failed.
        connexion.close()
# Script entry point: rebuild the spell table, then load the scraped spells.
# NOTE(review): createDatabase() opens 'database/spells.db' whereas
# insertSpells() opens 'assets/spells.db' -- confirm both are meant to
# target the same database file.
createDatabase()
insertSpells()

File diff suppressed because it is too large Load Diff

@ -0,0 +1,119 @@
# This file was used for debugging purposes.
# It helped with testing attributes separately in
# order to better parse them or to optimize/
# factor some code.
import urllib.request
import requests
import bs4
from bs4 import BeautifulSoup
from lxml import html
# Single spell page used to exercise the description parsing in isolation.
URL = "https://www.d20pfsrd.com/magic/all-spells/d/death-from-below/"
# A timeout keeps the script from hanging forever on an unresponsive server.
responseDetails = requests.get(URL, timeout=30)
spellSoup = BeautifulSoup(responseDetails.content, 'lxml')
spellContent = spellSoup.find(id='article-content')

###################
### DESCRIPTION
###################
# The description starts after a <p> whose text is exactly "DESCRIPTION".
spell_description = spellContent.find('p', string='DESCRIPTION') if spellContent else None
print("Desc separator: ", spell_description)

spell_paragraphs = []
if not spell_description:
    # Fallback: look for the marker in the "page-center" div instead.
    page_center = spellSoup.find('div', {'class': 'page-center'})
    spell_description = page_center.find('p', string='DESCRIPTION') if page_center else None

if spell_description:
    spell_description = spell_description.find_next('p')

# Collect paragraphs until a "section15" div has been passed or the next
# paragraph is a direct child of a <div>.
while spell_description and not spell_description.find_previous('div', {'class': 'section15'}):
    spell_paragraphs.append(spell_description.text)
    spell_description = spell_description.find_next('p')
    if spell_description and spell_description.parent.name == 'div':
        break

print("Spell description:\n", '\n\n'.join(spell_paragraphs))
def getStringSiblings(array, content, stop):
    """Gather the text that follows *content* and return it space-joined.

    Debug variant: every sibling visited is printed before it is examined.

    Args:
        array: List that receives the collected pieces; it is extended in
            place, and anything already in it is part of the result.
        content: Node whose ``next_siblings`` are walked. A falsy value means
            "label not found".
        stop: Tag name that ends the walk.

    Returns:
        ``None`` when *content* is falsy, otherwise the items of *array*
        joined with single spaces.
    """
    if not content:
        return None

    for node in content.next_siblings:
        print(node)
        tag_name = node.name
        if tag_name == stop:
            break
        if tag_name == 'a':
            # Links contribute their visible text.
            array.append(node.text)
            continue
        if not isinstance(node, bs4.element.NavigableString):
            # Any other tag is ignored.
            continue
        stripped = node.string.strip()
        if stripped:
            array.append(stripped.rstrip(';'))

    return ' '.join(array)
###################
### TARGET
###################
# target = []
# spell_target = spellContent.find('b',string="Target")
# spell_target = getStringSiblings(target, spell_target, 'b')
# print("Target: ", spell_target)
###################
### LEVELS
###################
# p = spellContent.find('b',string="School")
# print(p)
# p = p.find_previous('p')
# print(p)
# text = p.text
# print(text)
# parts = text.split("Level")
# school = parts[0].replace("School","").strip().strip(";")
# level = parts[1].replace("Level","").strip()
# print("---")
# print("School:", school)
# print("Level:", level)
# def getDescription(array, content):
# if content:
# content = content.find_next()
# array.append(content.text)
# if
# def getStringSiblings(array, content, stop):
# if content:
# for sibling in content.next_siblings:
# if sibling.name == stop:
# break
# if sibling.name == 'a':
# array.append(sibling.text)
# elif isinstance((sibling), bs4.element.NavigableString):
# component_text = sibling.string.strip()
# if component_text:
# array.append(component_text.rstrip(';'))
# else:
# return None
# return ' '.join(array)
###################
### DURATION
###################
# spell_duration = spellContent.find('b',string='Duration')
# if spell_duration:
# if spell_duration.next_sibling is not None:
# spell_duration = spell_duration.next_sibling.text.strip()
# else :
# print("fix here ---")
# print("first: ", spell_duration)
# spell_duration = spell_duration.find_next('br')
# print(spell_duration)
# else :
# spell_duration = None
# print("Duration: ",spell_duration)
# print("Duration: ",spell_duration)

@ -0,0 +1,187 @@
import urllib.request
import requests
import bs4
from bs4 import BeautifulSoup
from lxml import html
import yaml
### -------------------------------------
# GET ALL THE SPELLS FROM THE PAGE
### -------------------------------------
## GET <li> ELEMENTS NAME + URL TO DETAIL PAGE
# url with all spells
URL = "https://www.d20pfsrd.com/magic/all-spells/"
# get the page content using GET to url; the timeout prevents an indefinite
# hang and raise_for_status() stops early on an HTTP error page
response = requests.get(URL, timeout=30)
response.raise_for_status()
# parse html using the lxml parser
soup = BeautifulSoup(response.content, 'lxml')
# named spell_list rather than `list` so the builtin is not shadowed
spell_list = soup.find(id='article-content').find_next('div', class_="flexbox")
# this gets all the <li> elements from the flexbox div that follows the start of
# article-content; they contain all of the spells (name and link to detail page)
lis = spell_list.find_all('li')
###################
### METHODS
###################
def getStringSiblings(array, content, stop):
    """Gather the text that follows *content* and return it space-joined.

    Args:
        array: List that receives the collected pieces; it is extended in
            place, and anything already in it is part of the result.
        content: Node whose ``next_siblings`` are walked. A falsy value means
            "label not found".
        stop: Tag name that ends the walk.

    Returns:
        ``None`` when *content* is falsy, otherwise the items of *array*
        joined with single spaces.
    """
    if not content:
        return None

    for node in content.next_siblings:
        tag_name = node.name
        if tag_name == stop:
            break
        if tag_name == 'a':
            # Links contribute their visible text.
            array.append(node.text)
            continue
        if not isinstance(node, bs4.element.NavigableString):
            # Any other tag is ignored.
            continue
        stripped = node.string.strip()
        if stripped:
            array.append(stripped.rstrip(';'))

    return ' '.join(array)
# Simple "<b>Label</b> value" attributes, in extraction order:
# (key in the output mapping, bold label on the page, tag name that ends the
# value, prefix printed as progress output).
SPELL_FIELDS = (
    ('casting_time', 'Casting Time', 'b', "Cast time: "),
    # Components is the only attribute read up to a <p> rather than the next <b>.
    ('components', 'Components', 'p', "Components: "),
    ('range', 'Range', 'b', "Range: "),
    ('target', 'Target', 'b', "Target: "),
    ('duration', 'Duration', 'b', "Duration: "),
    ('saving_throw', 'Saving Throw', 'b', "Saving throw: "),
    ('spell_resistance', 'Spell Resistance', 'b', "Spell Resistance: "),
    ('area', 'Area', 'b', "Area:"),
    ('effect', 'Effect', 'b', 'Effect: '),
)


def _parse_school_and_level(spell_content):
    """Return ``(school, level)`` from the School/Level paragraph, or None if absent."""
    school_label = spell_content.find('b', string="School")
    if school_label is None:
        return None
    paragraph = school_label.find_previous('p')
    if paragraph is None:
        return None
    # Text before the first "Level" holds the school, text after it the levels.
    parts = paragraph.text.split("Level")
    if len(parts) < 2:
        return None
    school = parts[0].replace("School", "").strip().strip(";")
    level = parts[1].strip()
    return school, level


def _collect_description(spell_soup, spell_content):
    """Return the description paragraphs as a list, or None if no DESCRIPTION marker exists."""
    marker = spell_content.find('p', string='DESCRIPTION')
    if not marker:
        # Fallback: look for the marker in the "page-center" div instead.
        page_center = spell_soup.find('div', {'class': 'page-center'})
        marker = page_center.find('p', string='DESCRIPTION') if page_center else None
    if not marker:
        return None

    paragraphs = []
    paragraph = marker.find_next('p')
    # Stop once a "section15" div has been passed or the next paragraph is a
    # direct child of a <div>.
    while paragraph and not paragraph.find_previous('div', {'class': 'section15'}):
        paragraphs.append(paragraph.text)
        paragraph = paragraph.find_next('p')
        if paragraph and paragraph.parent.name == 'div':
            break
    return paragraphs


spellz = {}
for li in lis:
    # Skip list items that carry no link to a detail page.
    if li.a is None or not li.a.get('href'):
        continue
    url = li.a['href']

    ## get html of details page
    try:
        responseDetails = requests.get(url, timeout=30)
    except requests.RequestException as error:
        # One unreachable page must not discard everything scraped so far.
        print("skipped (request failed): ", url, error)
        continue
    spellSoup = BeautifulSoup(responseDetails.content, 'lxml')

    # get article content which contains all info about spells
    spellContent = spellSoup.find(id='article-content')
    if not spellContent:
        continue

    ###################
    ### ATTRIBUTES
    ###################
    # get name
    heading = spellContent.find('h1')
    if heading is None:
        print("skipped (no title): ", url)
        continue
    spell_name = heading.text
    print("name: ", spell_name)

    # get school and level
    school_and_level = _parse_school_and_level(spellContent)
    if school_and_level is None:
        print("skipped (no school/level): ", url)
        continue
    spell_school, spell_level = school_and_level
    print("School: ", spell_school)
    print("Level:", spell_level)

    spell = {'school': spell_school, 'level': spell_level}
    for key, label, stop, prefix in SPELL_FIELDS:
        value = getStringSiblings([], spellContent.find('b', string=label), stop)
        print(prefix, value)
        spell[key] = value

    # get description; pages without a DESCRIPTION marker are not recorded
    spell_paragraphs = _collect_description(spellSoup, spellContent)
    if spell_paragraphs is None:
        continue
    print("Spell description:\n", '\n\n'.join(spell_paragraphs))
    spell['description'] = spell_paragraphs

    spellz[spell_name] = spell

with open('outputs/spells.yaml', 'w', encoding='utf-8') as f:
    yaml.dump(spellz, f)
Loading…
Cancel
Save