fixed db spell conversion 🔧

new description working + updated outputs
rework description alg ♻️
6 changed files with 41545 additions and 16924 deletions
--- a/README.md
+++ b/README.md
@ -1,12 +1,12 @@
 # Spell Scrapper :scroll: :snake:

 ## About this repository
-This repository was built in order to have an "up to date" spells database for [SpellTastic](https://codefirst.iut.uca.fr/git/Spelltastic/Spelltastic), a cross-platform spell manager for Pathfinder.
+This repository was built in order to have an "up to date" spells database for [SpellTastic](https://codefirst.iut.uca.fr/git/Spelltastic/Spelltastic), a cross-platform spell manager for Pathfinder.

 ## Data source
 All data is retrieved from [d20pfsrd](https://www.d20pfsrd.com/) the __#1 Pathfinder Roleplaying Game rules reference site__. All spells can be found at [spells](https://www.d20pfsrd.com/magic/all-spells/).

-The latest data extracted is available as a YAML file and can be found in the `outputs` directory.
+The latest data extracted is available as a YAML file.

 ## Getting Started

@ -37,10 +37,10 @@ pip install requests beautifulsoup4 lxml pyyaml
 ```
 python scrapping/scrap-spells.py
 ```
+A progress bar should be displayed in your terminal indicating the time left and the number of spells scraped.
+__The script should take about 20 minutes to scrap all spells__

-4. This command will generate a file __spells.yaml__ with all spells and their attributes. The file should be found in the `outputs` directory.  
-
-_A progress bar should be displayed in your terminal while scrapping, showing the time left and the number of spells scraped. The script should takes about 20 minutes to scrap all spells._
+4. This command will generate a file __spells.yaml__ with all spells and their attributes

 ### Database
 5. You can build a __.db__ sqlite3 database file by running the __spell_db.py__ file:
@ -48,4 +48,4 @@ _A progress bar should be displayed in your terminal while scrapping, showing th
 python database/spell-db.py
 ```

-6. The script will generate a __spells.db__ file with a spell table containing all the spell information. This file should also be found in the `outputs` directory.
+6. The script will create a __spells.db__ file with a spell table containing all the spell information.
--- a/database/spell-dp.py
+++ b/database/spell-dp.py
@ -41,7 +41,7 @@ def insertSpells():
        duration = spell.get('duration')
        saving_throw = spell.get('saving_throw')
        spell_resistance = spell.get('spell_resistance')
-        description = '\n'.join(spell.get('description', []))
+        description = spell.get('description')
        components = spell.get('components')
        area = spell.get('area')
        effect = spell.get('effect')
--- a/outputs/spells.db
+++ b/outputs/spells.db
--- a/outputs/spells.yaml
+++ b/outputs/spells.yaml
--- a/scrapping/one_page_scrap.py
+++ b/scrapping/one_page_scrap.py
@ -9,62 +9,80 @@ import bs4
 from bs4 import BeautifulSoup
 from lxml import html

-URL = "https://www.d20pfsrd.com/magic/all-spells/a/ant-haul/"
+# URL = "https://www.d20pfsrd.com/magic/all-spells/m/magic-circle-against-evil/"
+# URL = "https://www.d20pfsrd.com/magic/all-spells/p/prophetic-lore/"
+URL = "https://www.d20pfsrd.com/magic/all-spells/d/death-from-below/"

 responseDetails = requests.get(URL)
 spellSoup = BeautifulSoup(responseDetails.content, 'lxml')
 spellContent = spellSoup.find(id='article-content')

+description = spellContent.find('p', string='DESCRIPTION')
+if description is None:
+    description = spellContent.find_next_sibling()
+
+next_elem = description.find_next('p')
+html= ''
+while next_elem and not (next_elem.name == 'div' and 'section15' in next_elem.get('class', [])):
+    print("ne;", next_elem )
+    print()
+    html += str(next_elem)
+    next_elem = next_elem.find_next_sibling()
+
+# Convert HTML string to regular string
+htmlString = str(html)
+
+if "<h4" in htmlString:
+    # Trim HTML string
+    htmlString = htmlString.split("<h4")[0]
+
+if "</p><div>" in htmlString:
+    last_p_index = htmlString.rfind("</p>")
+
+    if last_p_index != -1:
+        htmlString = htmlString[:last_p_index + 4]
+
+# Convert back to BeautifulSoup object
+trimmed_html = BeautifulSoup(htmlString, 'html.parser')
+str(trimmed_html)
+trimmed_html.prettify()
+print(trimmed_html)
+
 ###################
 ### LEVELS
 ###################
-school_levels = spellContent.find('b',string="School").find_previous('p')
-text = school_levels.text
-parts = text.split("Level")
-spell_school = parts[0].replace("School","").strip().strip(";")
-spell_level = parts[1].replace("Level","").strip().split(";")[0]
+# school_levels = spellContent.find('b',string="School").find_previous('p')
+# text = school_levels.text
+# parts = text.split("Level")
+# spell_school = parts[0].replace("School","").strip().strip(";")
+# spell_level = parts[1].replace("Level","").strip().split(";")[0]

-print("level: ", spell_level)
-print("school: ", spell_school)
+# print("level: ", spell_level)
+# print("school: ", spell_school)

 ###################
 ### DESCRIPTION
 ###################
-# spell_description = spellContent.find('p',string='DESCRIPTION')
-# print("Desc separator: ", spell_description)      
-
 # spell_paragraphs = []
 # spell_description = spellContent.find('p',string='DESCRIPTION')
 # if not spell_description:
 #     spell_description = spellSoup.find('div', {'class': 'page-center'}).find('p',string='DESCRIPTION')
-# spell_description = spell_description.find_next('p')
+# if not spell_description:
+#     spell_description = None
+#     exit # change to continue
+# spell_description = spell_description.find_next()
+# # check if spell description is a table
+# if spell_description.name == 'table':
+#     spell_paragraphs.append(spell_description)
+# spell_description = spell_description.find_next()

-# while spell_description and not spell_description.find_previous('div', {'class': 'section15'}):
-#     if spell_description.has_attr('class'):
-#         spell_paragraphs.append(spell_description.text)
-#     else:
-#         spell_paragraphs.append(spell_description.text)
-#     spell_description = spell_description.find_next('p')
-#     if spell_description and spell_description.parent.name == 'div':
-#         break
+# print("paragraphs: ", spell_paragraphs)
+# print("description: ", spell_description)
+# Find the parent tag of the <p> tag with text "DESCRIPTION"

-# print("Spell description:\n", '\n\n'.join(spell_paragraphs))

-# def getStringSiblings(array, content, stop):
-#     if content:
-#         for sibling in content.next_siblings:
-#             print(sibling)
-#             if sibling.name == stop:
-#                 break
-#             if sibling.name == 'a':
-#                 array.append(sibling.text)
-#             elif isinstance((sibling), bs4.element.NavigableString):
-#                 component_text = sibling.string.strip()
-#                 if component_text:
-#                     array.append(component_text.rstrip(';'))
-#     else:
-#         return None
-#     return ' '.join(array)
+# prettify
+

 ###################
 ### TARGET
--- a/scrapping/scrap-spells.py
+++ b/scrapping/scrap-spells.py
@ -55,7 +55,7 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at

    # get article content which contains all info about the spell
    spellContent = spellSoup.find(id='article-content')
-    pbar.update(1)
+    # pbar.update(1)

 ###################
 ### ATTRIBUTES
@ -67,7 +67,7 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
    else :
        spell_name = None
        continue
-    # print("name: ",spell_name)
+    print("name: ",spell_name)

    # get school and level
    school_levels = spellContent.find('b',string="School").find_previous('p')
@ -135,23 +135,32 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
    # print('Effect: ',spell_effect)

    # get description 
-    spell_paragraphs = []
-    spell_description = spellContent.find('p',string='DESCRIPTION')
-    if not spell_description:
-        spell_description = spellSoup.find('div', {'class': 'page-center'}).find('p',string='DESCRIPTION')
-    if not spell_description:
-        spell_description = None
-        continue
-    spell_description = spell_description.find_next('p')
-
-    while spell_description and not spell_description.find_previous('div', {'class': 'section15'}):
-        if spell_description.has_attr('class'):
-            spell_paragraphs.append(spell_description.text)
+    description = spellContent.find('p', string='DESCRIPTION')
+    if description is None:
+        description = spellContent.find_next_sibling()
    else:
-            spell_paragraphs.append(spell_description.text)
-        spell_description = spell_description.find_next('p')
-        if spell_description and spell_description.parent.name == 'div':
-            break
+        next_elem = description.find_next_sibling()
+
+    html = ''
+    while next_elem and not (next_elem.name == 'div' and 'section15' in next_elem.get('class', [])):
+        html += str(next_elem)
+        next_elem = next_elem.find_next_sibling()
+
+    # Convert HTML string to regular string
+    htmlString = str(html)
+
+    if "<h4" in htmlString:
+        # Trim HTML string
+        htmlString = htmlString.split("<h4")[0]
+
+    if "</p><div>" in htmlString:
+        last_p_index = htmlString.rfind("</p>")
+
+        if last_p_index != -1:
+            htmlString = htmlString[:last_p_index + 4]
+
+    # Keep the trimmed description as a raw HTML string
+    spell_paragraphs = htmlString

    # add all attributes to a spell dictionnary
    spellz[spell_name] = {
Author	SHA1	Message	Date
Nicolas FRANCO	c435e05f28	fixed db spell conversion 🔧	2 years ago
Nicolas FRANCO	936d185bcd	new description working + updated outputs	2 years ago
Nicolas FRANCO	3bde5a9363	rework description alg ♻️	2 years ago