#!/usr/bin/python3
"""HTML parsers that scrape skill gem data from wiki pages.

NOTE(review): this module was reconstructed from a whitespace-collapsed
patch.  Only the definitions the patch shows in full are reproduced here
(SkillGem, WikiParser, SkillGemListParser, SkillGemParser).
WikiTableParser, DivinationCardsParser, FlasksParser, scrape_flasks and
main are visible only as partial diff hunks and are intentionally not
reproduced; take them from the real file.
"""

import html.parser


def _class_contains(attrs, needle):
    """Return True if a ``class`` attribute in *attrs* contains *needle*.

    html.parser reports an attribute without a value (``<span class>``) as
    ``(name, None)``, so the value is checked for truthiness before the
    substring test instead of raising TypeError.
    """
    return any(n == "class" and v and needle in v for n, v in attrs)


class SkillGem:
    """Plain record describing one skill gem scraped from its wiki page."""

    def __init__(self, name):
        self.name = name
        # Filled in by SkillGemParser; None means "not found on the page".
        self.primary_attribute = None
        self.secondary_attribute = None
        # One of the short keys in SkillGemParser.prices, or None.
        self.price = None
        self.vaal = False
        self.drop_only = False

    def __repr__(self):
        return "{}({!r})".format(type(self).__name__, self.name)


class WikiParser(html.parser.HTMLParser):
    """Base parser that tracks whether we are inside an item hoverbox.

    ``ignore`` is the nesting depth of <span> elements counted from the
    outermost span whose class contains "c-item-hoverbox__display"; it is
    zero outside such a span.  Subclasses test it to skip hoverbox content.
    """

    def __init__(self):
        super().__init__()
        self.ignore = 0

    def handle_starttag(self, tag, attrs):
        if tag != "span":
            return
        if self.ignore:
            # Already inside a hoverbox: count nested spans so the matching
            # end tags can be paired up.
            self.ignore += 1
        elif _class_contains(attrs, "c-item-hoverbox__display"):
            self.ignore = 1

    def handle_endtag(self, tag):
        if tag == "span" and self.ignore:
            self.ignore -= 1


class SkillGemListParser(WikiParser):
    """Collect the href of every link that follows an <h2> reading "List".

    Links are gathered into ``links`` while ``in_list`` is set; the flag is
    re-evaluated by every non-empty text node seen inside an <h2>.
    """

    def __init__(self):
        super().__init__()

        self.in_subheading = False
        self.in_list = False
        self.links = []

    def handle_starttag(self, tag, attrs):
        super().handle_starttag(tag, attrs)

        if tag == "h2":
            self.in_subheading = True
        elif tag == "a" and self.in_list and not self.ignore:
            # Skip valueless href attributes (reported as None): they cannot
            # be turned into a URL by the caller.
            self.links.extend(v for n, v in attrs if n == "href" and v)

    def handle_endtag(self, tag):
        super().handle_endtag(tag)

        if tag == "h2":
            self.in_subheading = False

    def handle_data(self, data):
        data = data.strip()
        if not data:
            # Whitespace between inline elements of a heading must not
            # reset the flag (consistent with SkillGemParser.handle_data).
            return

        if self.in_subheading:
            self.in_list = (data == "List")


class SkillGemParser(WikiParser):
    """Parse a single skill gem page into a SkillGem stored in ``item``.

    ``item`` stays None until text inside an <h1> has been seen.
    """

    # (short key, currency item name) pairs; iterated by callers as
    # ``for key, name in SkillGemParser.prices``.
    prices = (("alchemy", "Orb of Alchemy"),
              ("chance", "Orb of Chance"),
              ("alteration", "Orb of Alteration"),
              ("transmute", "Orb of Transmutation"),
              ("wisdom", "Scroll of Wisdom"))

    # Reverse lookup: currency item name -> short key.
    _price_by_currency = {currency: key for key, currency in prices}

    def __init__(self):
        super().__init__()

        self.item = None
        self.in_heading = False
        self.in_subheading = False
        # Nesting depth of <span> elements inside the "item-box" span.
        self.in_infobox = 0
        self.in_box_heading = False
        self.in_purchase = False
        self.in_progression = False
        self.in_acquisition = False
        self.have_acquisition_data = False
        # 1-based position inside the table currently being read.
        self.row = 0
        self.column = 0
        # One [img alt text, requirement] pair per attribute column, kept in
        # column order so cell values can be stored by column index.
        self.attribute_reqs = []

    def handle_starttag(self, tag, attrs):
        super().handle_starttag(tag, attrs)

        if tag == "h1":
            self.in_heading = True
        elif tag == "h2":
            self.in_subheading = True
        elif tag == "span":
            if self.in_infobox:
                self.in_infobox += 1
            elif _class_contains(attrs, "item-box"):
                self.in_infobox = 1
        elif tag == "em":
            if self.in_infobox and _class_contains(attrs, "header"):
                # A new infobox section starts; it is only a purchase
                # section if its heading text says so (see handle_data).
                self.in_purchase = False
                self.in_box_heading = True
        elif tag == "table":
            self.row = 0
        elif tag == "tr":
            self.row += 1
            self.column = 0
        elif tag in ("td", "th"):
            self.column += 1
        elif tag == "img":
            if self.in_progression and self.column >= 3:
                for n, v in attrs:
                    if n == "alt":
                        self.attribute_reqs.append([v, 0])
        elif tag == "div":
            is_footer = any(n == "id" and v == "footer" for n, v in attrs)
            # Reaching the footer without any acquisition text marks the gem
            # as drop-only.  Guard against pages where no <h1> was parsed.
            if (is_footer and not self.have_acquisition_data
                    and self.item is not None):
                self.item.drop_only = True

    def handle_endtag(self, tag):
        super().handle_endtag(tag)

        if tag == "h1":
            self.in_heading = False
        elif tag == "h2":
            self.in_subheading = False
        elif tag == "span":
            if self.in_infobox:
                self.in_infobox -= 1
        elif tag == "em":
            self.in_box_heading = False
        elif tag == "tr":
            if self.in_progression and self.row == 2 and self.attribute_reqs:
                self._assign_attributes()

    def _assign_attributes(self):
        """Store the attributes with the highest requirements on the item."""
        if self.item is None:
            return

        # Highest requirement first, so the dominant attribute is the primary
        # one (the sort is stable, so ties keep column order).  A sorted copy
        # is used because attribute_reqs is indexed by table column.
        ranked = sorted(self.attribute_reqs, key=lambda r: r[1], reverse=True)
        self.item.primary_attribute = ranked[0][0]
        if len(ranked) > 1:
            self.item.secondary_attribute = ranked[1][0]

    def handle_data(self, data):
        data = data.strip()
        if not data:
            return

        if self.in_heading:
            # Drop a parenthesised suffix from the page title.
            name = data.partition("(")[0].strip()
            self.item = SkillGem(name)
            self.item.vaal = name.startswith("Vaal")
        elif self.in_subheading:
            self.in_progression = ("progression" in data)
            self.in_acquisition = ("acquisition" in data)
        elif self.in_box_heading:
            if "Purchase" in data:
                self.in_purchase = True
        elif self.in_purchase:
            # NOTE(review): in_purchase is only cleared by the next infobox
            # section header, so the branches below are unreachable until
            # one is seen -- confirm the pages always have one.
            if self.column == 2 and self.item is not None:
                key = self._price_by_currency.get(data)
                if key is not None:
                    self.item.price = key
        elif self.in_progression:
            index = self.column - 3
            if self.row == 2 and 0 <= index < len(self.attribute_reqs):
                try:
                    self.attribute_reqs[index][1] = int(data)
                except ValueError:
                    # Non-numeric cell: keep the default requirement of 0.
                    pass
        elif self.in_acquisition:
            self.have_acquisition_data = True