new description working + updated outputs

2 years ago · 936d185bcd
parent 3bde5a9363
commit 936d185bcd
4 changed files with 74228 additions and 7 deletions
--- a/outputs/spells.db
+++ b/outputs/spells.db
--- a/outputs/spells.yaml
+++ b/outputs/spells.yaml
--- a/scrapping/one_page_scrap.py
+++ b/scrapping/one_page_scrap.py
@@ -11,17 +11,21 @@ from lxml import html
 # URL = "https://www.d20pfsrd.com/magic/all-spells/m/magic-circle-against-evil/"
 # URL = "https://www.d20pfsrd.com/magic/all-spells/p/prophetic-lore/"
-URL = "https://www.d20pfsrd.com/magic/all-spells/t/time-stop/"
+URL = "https://www.d20pfsrd.com/magic/all-spells/d/death-from-below/"
 responseDetails = requests.get(URL)
 spellSoup = BeautifulSoup(responseDetails.content, 'lxml')
 spellContent = spellSoup.find(id='article-content')
 description = spellContent.find('p', string='DESCRIPTION')
 if description is None:
    description = spellContent.find_next_sibling()
-next_elem = description.find_next_sibling()
+next_elem = description.find_next('p')
-html = ''
+html= ''
 while next_elem and not (next_elem.name == 'div' and 'section15' in next_elem.get('class', [])):
    print("ne;", next_elem )
    print()
    html += str(next_elem)
    next_elem = next_elem.find_next_sibling()
--- a/scrapping/scrap-spells.py
+++ b/scrapping/scrap-spells.py
@@ -44,8 +44,7 @@ def getStringSiblings(array, content, stop):
    return ' '.join(array)
 spellz = {}
-# pbar = tqdm(total=2650, desc="[Processing]", unit=" spell")
+pbar = tqdm(total=2650, desc="[Processing]", unit=" spell")
 cpt = 0
 for li in lis: # now we loop over all spells, get the page link and scrap all attributes
    url = li.a['href']
@@ -137,8 +136,11 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
    # get description 
    description = spellContent.find('p', string='DESCRIPTION')
-    if(description):
+    if description is None:
        description = spellContent.find_next_sibling()
    else:
        next_elem = description.find_next_sibling()
    html = ''
    while next_elem and not (next_elem.name == 'div' and 'section15' in next_elem.get('class', [])):
        html += str(next_elem)
@@ -179,7 +181,7 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
 with open('outputs/spells.yaml', 'w') as f:
    yaml.dump(spellz, f)
-# pbar.close() 
+pbar.close()