git.tdb.fi Git - poefilter.git/commitdiff
Scrape skill gem data from the wiki
author Mikko Rasa <tdb@tdb.fi>
Fri, 17 Aug 2018 19:56:13 +0000 (22:56 +0300)
committer Mikko Rasa <tdb@tdb.fi>
Fri, 17 Aug 2018 19:56:29 +0000 (22:56 +0300)
scrape-wiki-data.py

index a141feaa6cff0e0c11040472d9f544d5eef6276c..b12f8a67681aa79e1469c1184749eee4d6d05b25 100755 (executable)
@@ -1,5 +1,6 @@
 #!/usr/bin/python3
 
+import sys
 import requests
 import html.parser
 import codecs
@@ -22,18 +23,48 @@ class Flask:
                other_aps = other.amount/other.duration
                return (aps>other_aps and self.amount>other.amount)
 
+class SkillGem:
+       def __init__(self, name):
+               self.name = name
+               self.primary_attribute = None
+               self.secondary_attribute = None
+               self.price = None
+               self.vaal = False
+               self.drop_only = False
+
 class WikiParser(html.parser.HTMLParser):
        def __init__(self):
                super(WikiParser, self).__init__()
 
+               self.ignore = 0
+
+       def handle_starttag(self, tag, attrs):
+               if tag=="span":
+                       if self.ignore:
+                               self.ignore += 1
+                       else:
+                               for n, v in attrs:
+                                       if n=="class" and "c-item-hoverbox__display" in v:
+                                               self.ignore = 1
+
+       def handle_endtag(self, tag):
+               if tag=="span":
+                       if self.ignore:
+                               self.ignore -= 1
+
+class WikiTableParser(WikiParser):
+       def __init__(self):
+               super(WikiTableParser, self).__init__()
+
                self.in_items_table = False
                self.column = 0
                self.in_cell = False
-               self.ignore_data = 0
                self.items = []
                self.current_item = None
 
        def handle_starttag(self, tag, attrs):
+               super(WikiTableParser, self).handle_starttag(tag, attrs)
+
                if tag=="table":
                        for n, v in attrs:
                                if n=="class" and "wikitable" in v:
@@ -45,25 +76,17 @@ class WikiParser(html.parser.HTMLParser):
                        self.column += 1
                        if self.in_items_table:
                                self.in_cell = True
-               elif tag=="span":
-                       if self.ignore_data:
-                               self.ignore_data += 1
-                       else:
-                               for n, v in attrs:
-                                       if n=="class" and "c-item-hoverbox__display" in v:
-                                               self.ignore_data = 1
 
        def handle_endtag(self, tag):
+               super(WikiTableParser, self).handle_endtag(tag)
+
                if tag=="table":
                        self.in_items_table = False
                elif tag=="td":
                        self.in_cell = False
-               elif tag=="span":
-                       if self.ignore_data:
-                               self.ignore_data -= 1
 
        def handle_data(self, data):
-               if self.ignore_data:
+               if self.ignore:
                        return
 
                data = data.strip()
@@ -84,11 +107,11 @@ class WikiParser(html.parser.HTMLParser):
        def handle_value(self, column, data):
                pass
 
-class DivinationCardsParser(WikiParser):
+class DivinationCardsParser(WikiTableParser):
        def handle_starttag(self, tag, attrs):
                super(DivinationCardsParser, self).handle_starttag(tag, attrs)
 
-               if tag=="span" and not self.ignore_data:
+               if tag=="span" and not self.ignore:
                        if self.in_cell and self.current_item and not self.current_item.reward_kind and self.column==3:
                                for n, v in attrs:
                                        if n=="class":
@@ -120,7 +143,7 @@ class DivinationCardsParser(WikiParser):
                        elif "Map" in data:
                                self.current_item.reward_kind = "map"
 
-class FlasksParser(WikiParser):
+class FlasksParser(WikiTableParser):
        def __init__(self, utility=False):
                super(FlasksParser, self).__init__()
 
@@ -147,6 +170,148 @@ class FlasksParser(WikiParser):
                elif (column==4 and not self.utility) or (column==3 and self.utility):
                        self.current_item.duration = float(data)
 
+class SkillGemListParser(WikiParser):
+       def __init__(self):
+               super(SkillGemListParser, self).__init__()
+
+               self.in_subheading = False
+               self.in_list = False
+               self.links = []
+
+       def handle_starttag(self, tag, attrs):
+               super(SkillGemListParser, self).handle_starttag(tag, attrs)
+
+               if tag=="h2":
+                       self.in_subheading = True
+               elif tag=="a":
+                       if self.in_list and not self.ignore:
+                               for n, v in attrs:
+                                       if n=="href":
+                                               self.links.append(v)
+
+       def handle_endtag(self, tag):
+               super(SkillGemListParser, self).handle_endtag(tag)
+
+               if tag=="h2":
+                       self.in_subheading = False
+
+       def handle_data(self, data):
+               data = data.strip()
+
+               if self.in_subheading:
+                       self.in_list = (data=="List")
+
+class SkillGemParser(WikiParser):
+       prices = (("alchemy", "Orb of Alchemy"),
+               ("chance", "Orb of Chance"),
+               ("alteration", "Orb of Alteration"),
+               ("transmute", "Orb of Transmutation"),
+               ("wisdom", "Scroll of Wisdom"))
+
+       def __init__(self):
+               super(SkillGemParser, self).__init__()
+
+               self.item = None
+               self.in_heading = False
+               self.in_subheading = False
+               self.in_subheading = False
+               self.in_infobox = 0
+               self.in_box_heading = False
+               self.in_purchase = False
+               self.in_progression = False
+               self.in_acquisition = False
+               self.have_acquisition_data = False
+               self.row = 0
+               self.column = 0
+               self.attribute_reqs = []
+
+       def handle_starttag(self, tag, attrs):
+               super(SkillGemParser, self).handle_starttag(tag, attrs)
+
+               if tag=="h1":
+                       self.in_heading = True
+               elif tag=="h2":
+                       self.in_subheading = True
+               elif tag=="span":
+                       if self.in_infobox:
+                               self.in_infobox += 1
+                       else:
+                               for n, v in attrs:
+                                       if n=="class" and "item-box" in v:
+                                               self.in_infobox = 1
+               elif tag=="em":
+                       if self.in_infobox:
+                               for n, v in attrs:
+                                       if n=="class" and "header" in v:
+                                               self.in_purchase = False
+                                               self.in_box_heading = True
+               elif tag=="table":
+                       self.row = 0
+               elif tag=="tr":
+                       self.row += 1
+                       self.column = 0
+               elif tag=="td" or tag=="th":
+                       self.column += 1
+               elif tag=="img":
+                       if self.in_progression and self.column>=3:
+                               for n, v in attrs:
+                                       if n=="alt":
+                                               self.attribute_reqs.append([v, 0])
+               elif tag=="div":
+                       for n, v in attrs:
+                               if n=="id" and v=="footer":
+                                       if not self.have_acquisition_data:
+                                               self.item.drop_only = True
+
+       def handle_endtag(self, tag):
+               super(SkillGemParser, self).handle_endtag(tag)
+
+               if tag=="h1":
+                       self.in_heading = False
+               elif tag=="h2":
+                       self.in_subheading = False
+               elif tag=="span":
+                       if self.in_infobox:
+                               self.in_infobox -= 1
+               elif tag=="em":
+                       self.in_box_heading = False
+               elif tag=="tr":
+                       if self.in_progression and self.row==2 and self.attribute_reqs:
+                               self.attribute_reqs.sort(key=lambda r: r[1])
+                               self.item.primary_attribute = self.attribute_reqs[0][0]
+                               if len(self.attribute_reqs)>1:
+                                       self.item.secondary_attribute = self.attribute_reqs[1][0]
+
+       def handle_data(self, data):
+               data = data.strip()
+               if not data:
+                       return
+
+               if self.in_heading:
+                       name = data
+                       paren = name.find('(')
+                       if paren>=0:
+                               name = name[:paren].strip()
+                       self.item = SkillGem(name)
+                       if name.startswith("Vaal"):
+                               self.item.vaal = True
+               elif self.in_subheading:
+                       self.in_progression = ("progression" in data)
+                       self.in_acquisition = ("acquisition" in data)
+               elif self.in_box_heading:
+                       if "Purchase" in data:
+                               self.in_purchase = True
+               elif self.in_purchase:
+                       if self.column==2:
+                               for p, n in SkillGemParser.prices:
+                                       if n==data:
+                                               self.item.price = p
+               elif self.in_progression:
+                       if self.row==2 and self.column>=3 and self.column<3+len(self.attribute_reqs):
+                               self.attribute_reqs[self.column-3][1] = int(data)
+               elif self.in_acquisition:
+                       self.have_acquisition_data = True
+
 def scrape_flasks(out, url, kind):
        r = requests.get(url)
        p = FlasksParser()
@@ -198,5 +363,52 @@ def main():
                out.write('\t\tbase_type "{}";\n'.format(it.name))
        out.write("\t};\n};\n")
 
+       r = requests.get("https://pathofexile.gamepedia.com/List_of_skill_gems")
+       p = SkillGemListParser()
+       p.feed(codecs.decode(r.content, r.encoding))
+
+       gems = []
+
+       prefix = ""
+       for l in p.links:
+               r2 = requests.get("https://pathofexile.gamepedia.com"+l)
+               p2 = SkillGemParser()
+               p2.feed(codecs.decode(r2.content, r2.encoding))
+               gems.append(p2.item)
+
+               sys.stdout.write(prefix+"{}\n".format(p2.item.name))
+               prefix = "\033[1A\033[K"
+
+       sys.stdout.write(prefix)
+
+       out = open("skillgems.txt", "w")
+       out.write('category "skillgem.vaal"\n{\n\tclass "Skill Gem";\n\tor\n\t{\n')
+       for g in gems:
+               if g.vaal:
+                       out.write('\t\tbase_type "{}";\n'.format(g.name))
+       out.write("\t};\n};\n")
+
+       out.write('category "skillgem.drop"\n{\n\tclass "Skill Gem";\n\tor\n\t{\n')
+       for g in gems:
+               if (g.drop_only or not g.price) and not g.vaal:
+                       out.write('\t\tbase_type "{}";\n'.format(g.name))
+       out.write("\t};\n};\n")
+
+       for p, n in SkillGemParser.prices:
+               out.write('category "skillgem.{}"\n'.format(p))
+               out.write('{\n\tclass "Skill Gem";\n\tor\n\t{\n')
+               for g in gems:
+                       if g.price==p:
+                               out.write('\t\tbase_type "{}";\n'.format(g.name))
+               out.write("\t};\n};\n")
+
+       for a in ["strength", "dexterity", "intelligence"]:
+               out.write('category "skillgem.{}"\n'.format(a))
+               out.write('{\n\tclass "Skill Gem";\n\tor\n\t{\n')
+               for g in gems:
+                       if g.primary_attribute==a:
+                               out.write('\t\tbase_type "{}";\n'.format(g.name))
+               out.write("\t};\n};\n")
+
 if __name__=="__main__":
        main()