diff --git a/scrapping/one_attribute_scrap.py b/scrapping/one_attribute_scrap.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrapping/one_page_scrap.py b/scrapping/one_page_scrap.py
index 2d9a661..c88bdab 100644
--- a/scrapping/one_page_scrap.py
+++ b/scrapping/one_page_scrap.py
@@ -9,50 +9,62 @@ import bs4
 from bs4 import BeautifulSoup
 from lxml import html
 
-URL = "https://www.d20pfsrd.com/magic/all-spells/d/death-from-below/"
+URL = "https://www.d20pfsrd.com/magic/all-spells/a/accelerate-poison/"
 responseDetails = requests.get(URL)
 spellSoup = BeautifulSoup(responseDetails.content, 'lxml')
 spellContent = spellSoup.find(id='article-content')
 
+###################
+### LEVELS
+###################
+school_levels = spellContent.find('b',string="School").find_previous('p')
+text = school_levels.text
+parts = text.split("Level")
+spell_school = parts[0].replace("School","").strip().strip(";")
+spell_level = parts[1].replace("Level","").strip().split(";")[0]
+
+print("level: ", spell_level)
+print("school: ", spell_school)
+
 ###################
 ### DESCRIPTION
 ###################
-spell_description = spellContent.find('p',string='DESCRIPTION')
-print("Desc separator: ", spell_description)
+# spell_description = spellContent.find('p',string='DESCRIPTION')
+# print("Desc separator: ", spell_description)
 
-spell_paragraphs = []
-spell_description = spellContent.find('p',string='DESCRIPTION')
-if not spell_description:
-    spell_description = spellSoup.find('div', {'class': 'page-center'}).find('p',string='DESCRIPTION')
-spell_description = spell_description.find_next('p')
+# spell_paragraphs = []
+# spell_description = spellContent.find('p',string='DESCRIPTION')
+# if not spell_description:
+#     spell_description = spellSoup.find('div', {'class': 'page-center'}).find('p',string='DESCRIPTION')
+# spell_description = spell_description.find_next('p')
 
-while spell_description and not spell_description.find_previous('div', {'class': 'section15'}):
-    if spell_description.has_attr('class'):
-        spell_paragraphs.append(spell_description.text)
-    else:
-        spell_paragraphs.append(spell_description.text)
-    spell_description = spell_description.find_next('p')
-    if spell_description and spell_description.parent.name == 'div':
-        break
+# while spell_description and not spell_description.find_previous('div', {'class': 'section15'}):
+#     if spell_description.has_attr('class'):
+#         spell_paragraphs.append(spell_description.text)
+#     else:
+#         spell_paragraphs.append(spell_description.text)
+#     spell_description = spell_description.find_next('p')
+#     if spell_description and spell_description.parent.name == 'div':
+#         break
 
-print("Spell description:\n", '\n\n'.join(spell_paragraphs))
+# print("Spell description:\n", '\n\n'.join(spell_paragraphs))
 
 
-def getStringSiblings(array, content, stop):
-    if content:
-        for sibling in content.next_siblings:
-            print(sibling)
-            if sibling.name == stop:
-                break
-            if sibling.name == 'a':
-                array.append(sibling.text)
-            elif isinstance((sibling), bs4.element.NavigableString):
-                component_text = sibling.string.strip()
-                if component_text:
-                    array.append(component_text.rstrip(';'))
-    else:
-        return None
-    return ' '.join(array)
+# def getStringSiblings(array, content, stop):
+#     if content:
+#         for sibling in content.next_siblings:
+#             print(sibling)
+#             if sibling.name == stop:
+#                 break
+#             if sibling.name == 'a':
+#                 array.append(sibling.text)
+#             elif isinstance((sibling), bs4.element.NavigableString):
+#                 component_text = sibling.string.strip()
+#                 if component_text:
+#                     array.append(component_text.rstrip(';'))
+#     else:
+#         return None
+#     return ' '.join(array)
 
 ###################
 ### TARGET
@@ -62,23 +74,6 @@ def getStringSiblings(array, content, stop):
 # spell_target = getStringSiblings(target, spell_target, 'b')
 # print("Target: ", spell_target)
 
-###################
-### LEVELS
-###################
-# p = spellContent.find('b',string="School")
-# print(p)
-# p = p.find_previous('p')
-# print(p)
-# text = p.text
-# print(text)
-# parts = text.split("Level")
-# school = parts[0].replace("School","").strip().strip(";")
-# level = parts[1].replace("Level","").strip()
-
-# print("---")
-# print("School:", school)
-# print("Level:", level)
-
 # def getDescription(array, content):
 #     if content:
 #         content = content.find_next()