git.tdb.fi Git - poefilter.git/commitdiff
Add a script to scrape armor and weapon data from the website
author Mikko Rasa <tdb@tdb.fi>
Wed, 15 Aug 2018 15:56:16 +0000 (18:56 +0300)
committer Mikko Rasa <tdb@tdb.fi>
Wed, 15 Aug 2018 15:56:16 +0000 (18:56 +0300)
scrape-item-data.py [new file with mode: 0755]

diff --git a/scrape-item-data.py b/scrape-item-data.py
new file mode 100755 (executable)
index 0000000..3040063
--- /dev/null
@@ -0,0 +1,217 @@
+#!/usr/bin/python3
+
+import requests
+import html.parser
+import codecs
+
+class Item:
+       def __init__(self, name, kind):
+               self.name = name
+               self.kind = kind
+               self.droplevel = 0
+               self.upgrade = None
+
+       def is_upgrade_for(self, other):
+               return self.kind==other.kind and self.droplevel>other.droplevel
+
+class Armor(Item):
+       def __init__(self, name, kind):
+               super(Armor, self).__init__(name, kind)
+
+               self.armor = 0
+               self.evasion = 0
+               self.energy_shield = 0
+
+       def is_upgrade_for(self, other):
+               if (self.armor!=0)!=(other.armor!=0):
+                       return False
+               if (self.evasion!=0)!=(other.evasion!=0):
+                       return False
+               if (self.energy_shield!=0)!=(other.energy_shield!=0):
+                       return False
+
+               return super(Armor, self).is_upgrade_for(other)
+
+class Weapon(Item):
+       def __init__(self, name, kind):
+               super(Weapon, self).__init__(name, kind)
+
+               self.speed = 0
+               self.dps = 0
+
+class ItemDataParser(html.parser.HTMLParser):
+       def __init__(self):
+               super(ItemDataParser, self).__init__()
+
+               self.in_items_table = False
+               self.column = 0
+               self.in_cell = False
+               self.ignore_row = False
+               self.items = {}
+               self.current_item = None
+               self.in_heading = False
+               self.headings = []
+               self.current_heading = None
+
+       def handle_starttag(self, tag, attrs):
+               if tag=="table":
+                       for n, v in attrs:
+                               if n=="class" and v=="itemDataTable":
+                                       self.in_items_table = True
+               elif tag=="tr":
+                       self.ignore_row = False
+                       for n, v in attrs:
+                               if n=="class" and v.endswith("_mod"):
+                                       self.ignore_row = True
+                       self.column = 0
+                       self.current_item = None
+               elif tag=="td":
+                       self.column += 1
+                       self.in_cell = True
+               elif tag=="h1":
+                       self.in_heading = True
+
+       def handle_endtag(self, tag):
+               if tag=="table":
+                       self.in_items_table = False
+               elif tag=="td":
+                       self.in_cell = False
+               elif tag=="tr":
+                       if self.current_item:
+                               for it in self.items.values():
+                                       if not it.upgrade and self.current_item.is_upgrade_for(it):
+                                               it.upgrade = self.current_item
+                                               break
+               elif tag=="h1":
+                       self.in_heading = False
+
+       def handle_data(self, data):
+               data = data.strip()
+               if self.in_heading:
+                       self.current_heading = data
+                       self.headings.append(self.current_heading)
+               elif self.in_items_table and self.in_cell and data and not self.ignore_row:
+                       if self.column==2:
+                               self.current_item = self.create_item(data, self.current_heading)
+                               self.items[data] = self.current_item
+                       elif self.column==3:
+                               self.current_item.droplevel = int(data)
+                       elif self.column>=4:
+                               self.handle_value(self.column, data)
+
+       def create_item(self, name, kind):
+               pass
+
+       def handle_value(self, column, data):
+               pass
+
+class ArmorDataParser(ItemDataParser):
+       def create_item(self, name, kind):
+               return Armor(name, kind)
+
+       def handle_value(self, column, data):
+               if column==4:
+                       self.current_item.armor = int(data)
+               elif column==5:
+                       self.current_item.evasion = int(data)
+               elif column==6:
+                       self.current_item.energy_shield = int(data)
+
+class WeaponDataParser(ItemDataParser):
+       def create_item(self, name, kind):
+               return Weapon(name, kind)
+
+       def handle_value(self, column, data):
+               if column==5:
+                       self.current_item.speed = float(data)
+               elif column==6:
+                       self.current_item.dps = float(data)
+
+def write_best_category(out, prefix, items, steps):
+       best = ["best", "second"]
+
+       out.write('category "{}.{}_at_level"\n'.format(prefix, best[steps-1]))
+       out.write("{\n\tor\n\t{\n")
+       for it in items:
+               upgrade = it
+               for k in range(steps):
+                       upgrade = upgrade.upgrade
+                       if not upgrade:
+                               break
+               out.write("\t\tand\n\t\t{\n")
+               if upgrade:
+                       out.write('\t\t\tbase_type "{}";\n'.format(it.name))
+                       out.write("\t\t\titem_level {} {};\n".format(it.droplevel, upgrade.droplevel-1))
+               else:
+                       out.write('\t\t\tbase_type "{}";\n'.format(it.name))
+                       out.write("\t\t\tmin_item_level {};\n".format(it.droplevel))
+               out.write("\t\t};\n")
+       out.write("\t};\n};\n")
+
+def main():
+       r = requests.get("https://www.pathofexile.com/item-data/armour")
+       p = ArmorDataParser()
+       p.feed(codecs.decode(r.content, r.encoding))
+
+       types = {"robe": lambda i: (not i.armor and not i.evasion and i.energy_shield),
+               "cloth": lambda i: (not i.armor and i.evasion and i.energy_shield),
+               "leather": lambda i: (not i.armor and i.evasion and not i.energy_shield),
+               "scale": lambda i: (i.armor and i.evasion and not i.energy_shield),
+               "plate": lambda i: (i.armor and not i.evasion and not i.energy_shield),
+               "chain": lambda i: (i.armor and not i.evasion and i.energy_shield)}
+
+       out = open("armor.txt", "w")
+
+       for t, f in types.items():
+               items = [i for i in p.items.values() if f(i)]
+
+               for i in range(1, 3):
+                       write_best_category(out, "armor.{}".format(t), items, i)
+                       """out.write('category "armor.{}.{}_at_level"\n'.format(t, best[i-1]))
+                       out.write("{\n\tor\n\t{\n")
+                       for it in items:
+                               next_level = 0
+                               n = it.name
+                               for k in range(i):
+                                       n = p.upgrades.get(n, "")
+                               if n:
+                                       next_level = p.items[n].droplevel
+                               if next_level:
+                                       out.write("\t\tand\n\t\t{\n")
+                                       out.write('\t\t\tbase_type "{}";\n'.format(it.name))
+                                       out.write("\t\t\tmax_item_level {};\n".format(next_level-1))
+                                       out.write("\t\t};\n")
+                               else:
+                                       out.write('\t\tbase_type "{}";\n'.format(it.name))
+                       out.write("\t};\n};\n")"""
+
+               out.write('category "armor.{}"\n'.format(t))
+               out.write("{\n\tor\n\t{\n")
+               for it in items:
+                       out.write('\t\tbase_type "{}";\n'.format(it.name))
+               out.write("\t};\n};\n")
+
+       out.write('category "armor"\n{\n\tor\n\t{\n')
+       for h in p.headings:
+               out.write('\t\tclass "{}";\n'.format(h))
+       out.write("\t};\n};\n")
+
+       r = requests.get("https://www.pathofexile.com/item-data/weapon")
+       p = WeaponDataParser()
+       p.feed(codecs.decode(r.content, r.encoding))
+
+       out = open("weapons.txt", "w")
+
+       for h in p.headings:
+               items = [i for i in p.items.values() if i.kind==h]
+
+               for i in range(1, 3):
+                       write_best_category(out, "weapon.{}".format(h.lower().replace(' ', '_')), items, i)
+
+       out.write('category "weapon"\n{\n\tor\n\t{\n')
+       for h in p.headings:
+               out.write('\t\tclass "{}";\n'.format(h))
+       out.write("\t};\n};\n")
+
+if __name__=="__main__":
+       main()