readme adjustments 📝

6 changed files with 16924 additions and 41545 deletions
--- a/README.md
+++ b/README.md
@ -1,12 +1,12 @@
 # Spell Scrapper :scroll: :snake:

 ## About this repository
-This repository was built in order to have an "up to date" spells database for [SpellTastic](https://codefirst.iut.uca.fr/git/Spelltastic/Spelltastic), a cross-platform spell manager for Patfinder.
+This repository was built in order to have an "up to date" spells database for [SpellTastic](https://codefirst.iut.uca.fr/git/Spelltastic/Spelltastic), a cross-platform spell manager for Pathfinder.

 ## Data source
 All data is retrieved from [d20pfsrd](https://www.d20pfsrd.com/) the __#1 Pathfinder Roleplaying Game rules reference site__. All spells can be found at [spells](https://www.d20pfsrd.com/magic/all-spells/).

-The latest data extracted is available as a YAML file.
+The latest data extracted is available as a YAML file and can be found in the `outputs` directory.

 ## Getting Started

@ -37,10 +37,10 @@ pip install requests beautifulsoup4 lxml pyyaml
 ```
 python scrapping/scrap-spells.py
 ```
-A progress bar should be displayed in your terminal indicating the time left and the number of spells scraped.
-__The script should takes about 20 minutes to scrap all spells__

-4. This command will generate a file __spells.yaml__ with all spells and their attributes
+4. This command will generate a file __spells.yaml__ with all spells and their attributes. The file should be found in the `outputs` directory.  
+
+_A progress bar should be displayed in your terminal while scraping, showing the time left and the number of spells scraped. The script should take about 20 minutes to scrape all spells._

 ### Database
 5. You can build a __.db__ sqlite3 database file by running the __spell_db.py__ file:
@ -48,4 +48,4 @@ __The script should takes about 20 minutes to scrap all spells__
 python database/spell-db.py
 ```

-6. The script will create a __spells.db__ file with a spell table containing all the spell information.
+6. The script will generate a __spells.db__ file with a spell table containing all the spell information. This file should also be found in the `outputs` directory.
--- a/database/spell-dp.py
+++ b/database/spell-dp.py
@ -41,7 +41,7 @@ def insertSpells():
        duration = spell.get('duration')
        saving_throw = spell.get('saving_throw')
        spell_resistance = spell.get('spell_resistance')
-        description = spell.get('description')
+        description = '\n'.join(spell.get('description', []))
        components = spell.get('components')
        area = spell.get('area')
        effect = spell.get('effect')
--- a/outputs/spells.db
+++ b/outputs/spells.db
--- a/outputs/spells.yaml
+++ b/outputs/spells.yaml
--- a/scrapping/one_page_scrap.py
+++ b/scrapping/one_page_scrap.py
@ -9,80 +9,62 @@ import bs4
 from bs4 import BeautifulSoup
 from lxml import html

-# URL = "https://www.d20pfsrd.com/magic/all-spells/m/magic-circle-against-evil/"
-# URL = "https://www.d20pfsrd.com/magic/all-spells/p/prophetic-lore/"
-URL = "https://www.d20pfsrd.com/magic/all-spells/d/death-from-below/"
+URL = "https://www.d20pfsrd.com/magic/all-spells/a/ant-haul/"

 responseDetails = requests.get(URL)
 spellSoup = BeautifulSoup(responseDetails.content, 'lxml')
 spellContent = spellSoup.find(id='article-content')

-description = spellContent.find('p', string='DESCRIPTION')
-if description is None:
-    description = spellContent.find_next_sibling()
-
-next_elem = description.find_next('p')
-html= ''
-while next_elem and not (next_elem.name == 'div' and 'section15' in next_elem.get('class', [])):
-    print("ne;", next_elem )
-    print()
-    html += str(next_elem)
-    next_elem = next_elem.find_next_sibling()
-
-# Convert HTML string to regular string
-htmlString = str(html)
-
-if "<h4" in htmlString:
-    # Trim HTML string
-    htmlString = htmlString.split("<h4")[0]
-
-if "</p><div>" in htmlString:
-    last_p_index = htmlString.rfind("</p>")
-
-    if last_p_index != -1:
-        htmlString = htmlString[:last_p_index + 4]
-
-# Convert back to BeautifulSoup object
-trimmed_html = BeautifulSoup(htmlString, 'html.parser')
-str(trimmed_html)
-trimmed_html.prettify()
-print(trimmed_html)
-
 ###################
 ### LEVELS
 ###################
-# school_levels = spellContent.find('b',string="School").find_previous('p')
-# text = school_levels.text
-# parts = text.split("Level")
-# spell_school = parts[0].replace("School","").strip().strip(";")
-# spell_level = parts[1].replace("Level","").strip().split(";")[0]
+school_levels = spellContent.find('b',string="School").find_previous('p')
+text = school_levels.text
+parts = text.split("Level")
+spell_school = parts[0].replace("School","").strip().strip(";")
+spell_level = parts[1].replace("Level","").strip().split(";")[0]

-# print("level: ", spell_level)
-# print("school: ", spell_school)
+print("level: ", spell_level)
+print("school: ", spell_school)

 ###################
 ### DESCRIPTION
 ###################
+# spell_description = spellContent.find('p',string='DESCRIPTION')
+# print("Desc separator: ", spell_description)      
+
 # spell_paragraphs = []
 # spell_description = spellContent.find('p',string='DESCRIPTION')
 # if not spell_description:
 #     spell_description = spellSoup.find('div', {'class': 'page-center'}).find('p',string='DESCRIPTION')
-# if not spell_description:
-#     spell_description = None
-#     exit # change to continue
-# spell_description = spell_description.find_next()
-# # check if spell description is a table
-# if spell_description.name == 'table':
-#     spell_paragraphs.append(spell_description)
-# spell_description = spell_description.find_next()
-
-# print("paragraphs: ", spell_paragraphs)
-# print("description: ", spell_description)
-# Find the parent tag of the <p> tag with text "DESCRIPTION"
+# spell_description = spell_description.find_next('p')

+# while spell_description and not spell_description.find_previous('div', {'class': 'section15'}):
+#     if spell_description.has_attr('class'):
+#         spell_paragraphs.append(spell_description.text)
+#     else:
+#         spell_paragraphs.append(spell_description.text)
+#     spell_description = spell_description.find_next('p')
+#     if spell_description and spell_description.parent.name == 'div':
+#         break

-# prettify
+# print("Spell description:\n", '\n\n'.join(spell_paragraphs))

+# def getStringSiblings(array, content, stop):
+#     if content:
+#         for sibling in content.next_siblings:
+#             print(sibling)
+#             if sibling.name == stop:
+#                 break
+#             if sibling.name == 'a':
+#                 array.append(sibling.text)
+#             elif isinstance((sibling), bs4.element.NavigableString):
+#                 component_text = sibling.string.strip()
+#                 if component_text:
+#                     array.append(component_text.rstrip(';'))
+#     else:
+#         return None
+#     return ' '.join(array)

 ###################
 ### TARGET
--- a/scrapping/scrap-spells.py
+++ b/scrapping/scrap-spells.py
@ -55,7 +55,7 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at

    # get article content which contains all info about the spell
    spellContent = spellSoup.find(id='article-content')
-    # pbar.update(1)
+    pbar.update(1)

 ###################
 ### ATTRIBUTES
@ -67,7 +67,7 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
    else :
        spell_name = None
        continue
-    print("name: ",spell_name)
+    # print("name: ",spell_name)

    # get school and level
    school_levels = spellContent.find('b',string="School").find_previous('p')
@ -135,32 +135,23 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
    # print('Effect: ',spell_effect)

    # get description 
-    description = spellContent.find('p', string='DESCRIPTION')
-    if description is None:
-        description = spellContent.find_next_sibling()
-    else:
-        next_elem = description.find_next_sibling()
-
-    html = ''
-    while next_elem and not (next_elem.name == 'div' and 'section15' in next_elem.get('class', [])):
-        html += str(next_elem)
-        next_elem = next_elem.find_next_sibling()
-
-    # Convert HTML string to regular string
-    htmlString = str(html)
-
-    if "<h4" in htmlString:
-        # Trim HTML string
-        htmlString = htmlString.split("<h4")[0]
-
-    if "</p><div>" in htmlString:
-        last_p_index = htmlString.rfind("</p>")
-
-        if last_p_index != -1:
-            htmlString = htmlString[:last_p_index + 4]
+    spell_paragraphs = []
+    spell_description = spellContent.find('p',string='DESCRIPTION')
+    if not spell_description:
+        spell_description = spellSoup.find('div', {'class': 'page-center'}).find('p',string='DESCRIPTION')
+    if not spell_description:
+        spell_description = None
+        continue
+    spell_description = spell_description.find_next('p')

-    # Convert back to BeautifulSoup object
-    spell_paragraphs = htmlString
+    while spell_description and not spell_description.find_previous('div', {'class': 'section15'}):
+        if spell_description.has_attr('class'):
+            spell_paragraphs.append(spell_description.text)
+        else:
+            spell_paragraphs.append(spell_description.text)
+        spell_description = spell_description.find_next('p')
+        if spell_description and spell_description.parent.name == 'div':
+            break

    # add all attributes to a spell dictionnary
    spellz[spell_name] = {
Author	SHA1	Message	Date
Nicolas FRANCO	b94d5ddf1d	readme adjustments 📝	2 years ago
Nicolas FRANCO	ee80cef94b	readme adjustments 📝	2 years ago