rework description alg ♻️

2 years ago · 3bde5a9363
parent 003bd32c8f
commit 3bde5a9363
3 changed files with 77 additions and 49677 deletions
--- a/outputs/spells.yaml
+++ b/outputs/spells.yaml
--- a/scrapping/one_page_scrap.py
+++ b/scrapping/one_page_scrap.py
@ -9,62 +9,76 @@ import bs4
 from bs4 import BeautifulSoup
 from lxml import html

-URL = "https://www.d20pfsrd.com/magic/all-spells/a/ant-haul/"
+# URL = "https://www.d20pfsrd.com/magic/all-spells/m/magic-circle-against-evil/"
+# URL = "https://www.d20pfsrd.com/magic/all-spells/p/prophetic-lore/"
+URL = "https://www.d20pfsrd.com/magic/all-spells/t/time-stop/"

 responseDetails = requests.get(URL)
 spellSoup = BeautifulSoup(responseDetails.content, 'lxml')
 spellContent = spellSoup.find(id='article-content')

+description = spellContent.find('p', string='DESCRIPTION')
+
+next_elem = description.find_next_sibling()
+html = ''
+while next_elem and not (next_elem.name == 'div' and 'section15' in next_elem.get('class', [])):
+    html += str(next_elem)
+    next_elem = next_elem.find_next_sibling()
+
+# html is already a str (built by concatenation above); str() just rebinds it
+htmlString = str(html)
+
+if "<h4" in htmlString:
+    # Trim HTML string
+    htmlString = htmlString.split("<h4")[0]
+
+if "</p><div>" in htmlString:
+    last_p_index = htmlString.rfind("</p>")
+
+    if last_p_index != -1:
+        htmlString = htmlString[:last_p_index + 4]
+
+# Convert back to BeautifulSoup object
+trimmed_html = BeautifulSoup(htmlString, 'html.parser')
+str(trimmed_html)
+trimmed_html.prettify()
+print(trimmed_html)
+
 ###################
 ### LEVELS
 ###################
-school_levels = spellContent.find('b',string="School").find_previous('p')
-text = school_levels.text
-parts = text.split("Level")
-spell_school = parts[0].replace("School","").strip().strip(";")
-spell_level = parts[1].replace("Level","").strip().split(";")[0]
+# school_levels = spellContent.find('b',string="School").find_previous('p')
+# text = school_levels.text
+# parts = text.split("Level")
+# spell_school = parts[0].replace("School","").strip().strip(";")
+# spell_level = parts[1].replace("Level","").strip().split(";")[0]

-print("level: ", spell_level)
-print("school: ", spell_school)
+# print("level: ", spell_level)
+# print("school: ", spell_school)

 ###################
 ### DESCRIPTION
 ###################
-# spell_description = spellContent.find('p',string='DESCRIPTION')
-# print("Desc separator: ", spell_description)      
-
 # spell_paragraphs = []
 # spell_description = spellContent.find('p',string='DESCRIPTION')
 # if not spell_description:
 #     spell_description = spellSoup.find('div', {'class': 'page-center'}).find('p',string='DESCRIPTION')
-# spell_description = spell_description.find_next('p')
+# if not spell_description:
+#     spell_description = None
+#     exit # change to continue
+# spell_description = spell_description.find_next()
+# # check if spell description is a table
+# if spell_description.name == 'table':
+#     spell_paragraphs.append(spell_description)
+# spell_description = spell_description.find_next()

-# while spell_description and not spell_description.find_previous('div', {'class': 'section15'}):
-#     if spell_description.has_attr('class'):
-#         spell_paragraphs.append(spell_description.text)
-#     else:
-#         spell_paragraphs.append(spell_description.text)
-#     spell_description = spell_description.find_next('p')
-#     if spell_description and spell_description.parent.name == 'div':
-#         break
+# print("paragraphs: ", spell_paragraphs)
+# print("description: ", spell_description)
+# Find the parent tag of the <p> tag with text "DESCRIPTION"

-# print("Spell description:\n", '\n\n'.join(spell_paragraphs))

-# def getStringSiblings(array, content, stop):
-#     if content:
-#         for sibling in content.next_siblings:
-#             print(sibling)
-#             if sibling.name == stop:
-#                 break
-#             if sibling.name == 'a':
-#                 array.append(sibling.text)
-#             elif isinstance((sibling), bs4.element.NavigableString):
-#                 component_text = sibling.string.strip()
-#                 if component_text:
-#                     array.append(component_text.rstrip(';'))
-#     else:
-#         return None
-#     return ' '.join(array)
+# prettify
+

 ###################
 ### TARGET
--- a/scrapping/scrap-spells.py
+++ b/scrapping/scrap-spells.py
@ -44,7 +44,8 @@ def getStringSiblings(array, content, stop):
    return ' '.join(array)

 spellz = {}
-pbar = tqdm(total=2650, desc="[Processing]", unit=" spell")
+# pbar = tqdm(total=2650, desc="[Processing]", unit=" spell")
+cpt = 0

 for li in lis: # now we loop over all spells, get the page link and scrap all attributes
    url = li.a['href']
@ -55,7 +56,7 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at

    # get article content which contains all info about the spell
    spellContent = spellSoup.find(id='article-content')
-    pbar.update(1)
+    # pbar.update(1)

 ###################
 ### ATTRIBUTES
@ -67,7 +68,7 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
    else :
        spell_name = None
        continue
-    # print("name: ",spell_name)
+    print("name: ",spell_name)

    # get school and level
    school_levels = spellContent.find('b',string="School").find_previous('p')
@ -135,23 +136,29 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
    # print('Effect: ',spell_effect)

    # get description 
-    spell_paragraphs = []
-    spell_description = spellContent.find('p',string='DESCRIPTION')
-    if not spell_description:
-        spell_description = spellSoup.find('div', {'class': 'page-center'}).find('p',string='DESCRIPTION')
-    if not spell_description:
-        spell_description = None
-        continue
-    spell_description = spell_description.find_next('p')
+    description = spellContent.find('p', string='DESCRIPTION')
+    if(description):
+        next_elem = description.find_next_sibling()
+    html = ''
+    while next_elem and not (next_elem.name == 'div' and 'section15' in next_elem.get('class', [])):
+        html += str(next_elem)
+        next_elem = next_elem.find_next_sibling()
+
+    # html is already a str (built by concatenation above); str() just rebinds it
+    htmlString = str(html)
+
+    if "<h4" in htmlString:
+        # Trim HTML string
+        htmlString = htmlString.split("<h4")[0]
+
+    if "</p><div>" in htmlString:
+        last_p_index = htmlString.rfind("</p>")
+
+        if last_p_index != -1:
+            htmlString = htmlString[:last_p_index + 4]

-    while spell_description and not spell_description.find_previous('div', {'class': 'section15'}):
-        if spell_description.has_attr('class'):
-            spell_paragraphs.append(spell_description.text)
-        else:
-            spell_paragraphs.append(spell_description.text)
-        spell_description = spell_description.find_next('p')
-        if spell_description and spell_description.parent.name == 'div':
-            break
+    # Keep the trimmed description as a raw HTML string (it is not re-parsed here)
+    spell_paragraphs = htmlString

    # add all attributes to a spell dictionnary
    spellz[spell_name] = {
@ -172,7 +179,7 @@ for li in lis: # now we loop over all spells, get the page link and scrap all at
 with open('outputs/spells.yaml', 'w') as f:
    yaml.dump(spellz, f)

-pbar.close() 
+# pbar.close()