#!/usr/bin/python3
"""Scrape armor and weapon data from the website.

Downloads the armour and weapon item-data pages from pathofexile.com,
parses the item tables and writes filter category definitions to
``armor.txt`` and ``weapons.txt`` in the current directory.
"""

# Provenance: reconstructed from the patch that added scrape-item-data.py.
#   From: Mikko Rasa
#   Date: Wed, 15 Aug 2018 15:56:16 +0000 (+0300)
#   Subject: Add a script to scrape armor and weapon data from the website
#   X-Git-Url: http://git.tdb.fi/?p=poefilter.git;a=commitdiff_plain;h=980a6425ae617f403010b70e170805b755feb98f

import html.parser
import codecs

# NOTE: `requests` is imported inside _fetch_page() so that the parser and
# writer code in this file can be imported without the third-party package.

ARMOUR_URL = "https://www.pathofexile.com/item-data/armour"
WEAPON_URL = "https://www.pathofexile.com/item-data/weapon"

# Armor type name -> which defences are present, as
# (has armor, has evasion, has energy shield).
ARMOR_TYPES = {
    "robe": (False, False, True),
    "cloth": (False, True, True),
    "leather": (False, True, False),
    "scale": (True, True, False),
    "plate": (True, False, False),
    "chain": (True, False, True),
}


class Item:
    """A base item type scraped from an item-data table."""

    def __init__(self, name, kind):
        self.name = name
        # Text of the <h1> heading that preceded the item's table.
        self.kind = kind
        self.droplevel = 0
        # Next item in the upgrade chain, or None if there is none.
        self.upgrade = None

    def is_upgrade_for(self, other):
        """Return True if this item is the same kind as *other* with a higher drop level."""
        return self.kind == other.kind and self.droplevel > other.droplevel


class Armor(Item):
    """An armor piece with its three defence values."""

    def __init__(self, name, kind):
        super().__init__(name, kind)

        self.armor = 0
        self.evasion = 0
        self.energy_shield = 0

    def defence_profile(self):
        """Return (has armor, has evasion, has energy shield) as booleans."""
        return (self.armor != 0, self.evasion != 0, self.energy_shield != 0)

    def is_upgrade_for(self, other):
        """Return True if this upgrades *other* and both have the same set of defences."""
        if self.defence_profile() != other.defence_profile():
            return False

        return super().is_upgrade_for(other)


class Weapon(Item):
    """A weapon with its speed and DPS values."""

    def __init__(self, name, kind):
        super().__init__(name, kind)

        self.speed = 0
        self.dps = 0


class ItemDataParser(html.parser.HTMLParser):
    """Collect items from ``<table class="itemDataTable">`` tables.

    Each ``<h1>`` text becomes a heading, used as the kind of the items that
    follow it.  In a table row, the second cell is the item name, the third
    the drop level, and later cells are passed to handle_value().  Rows whose
    class ends in "_mod" are skipped.  When a row ends, its item is linked as
    the upgrade of the first earlier item that has no upgrade yet and for
    which is_upgrade_for() holds.

    Subclasses must implement create_item() and may override handle_value().
    """

    def __init__(self):
        super().__init__()

        self.in_items_table = False
        self.column = 0
        self.in_cell = False
        self.ignore_row = False
        self.items = {}
        self.current_item = None
        self.in_heading = False
        self.headings = []
        self.current_heading = None

    def handle_starttag(self, tag, attrs):
        if tag == "table":
            if ("class", "itemDataTable") in attrs:
                self.in_items_table = True
        elif tag == "tr":
            # A valueless attribute is reported with value None, so test the
            # value's truthiness before calling a str method on it.
            self.ignore_row = any(
                n == "class" and v and v.endswith("_mod") for n, v in attrs)
            self.column = 0
            self.current_item = None
        elif tag == "td":
            self.column += 1
            self.in_cell = True
        elif tag == "h1":
            self.in_heading = True

    def handle_endtag(self, tag):
        if tag == "table":
            self.in_items_table = False
        elif tag == "td":
            self.in_cell = False
        elif tag == "tr":
            if self.current_item:
                self._link_upgrade(self.current_item)
        elif tag == "h1":
            self.in_heading = False

    def _link_upgrade(self, item):
        """Make *item* the upgrade of the first eligible earlier item."""
        for candidate in self.items.values():
            if not candidate.upgrade and item.is_upgrade_for(candidate):
                candidate.upgrade = item
                break

    def handle_data(self, data):
        data = data.strip()
        if not data:
            # Whitespace-only chunks carry no information; in particular they
            # must not be recorded as (empty) headings.
            return

        if self.in_heading:
            self.current_heading = data
            self.headings.append(data)
            return

        if not (self.in_items_table and self.in_cell) or self.ignore_row:
            return

        if self.column == 2:
            self.current_item = self.create_item(data, self.current_heading)
            self.items[data] = self.current_item
        elif self.current_item is None:
            # Value cell in a row without a name cell: nothing to attach to.
            return
        elif self.column == 3:
            self.current_item.droplevel = int(data)
        elif self.column >= 4:
            self.handle_value(self.column, data)

    def create_item(self, name, kind):
        """Return a new item object for a table row; subclasses must override."""
        raise NotImplementedError("subclasses must implement create_item()")

    def handle_value(self, column, data):
        """Store the text of cell *column* (4 or later) on self.current_item."""
        pass


class ArmorDataParser(ItemDataParser):
    """Parser that reads armor, evasion and energy shield from columns 4-6."""

    def create_item(self, name, kind):
        return Armor(name, kind)

    def handle_value(self, column, data):
        if column == 4:
            self.current_item.armor = int(data)
        elif column == 5:
            self.current_item.evasion = int(data)
        elif column == 6:
            self.current_item.energy_shield = int(data)


class WeaponDataParser(ItemDataParser):
    """Parser that reads speed and DPS from columns 5 and 6."""

    def create_item(self, name, kind):
        return Weapon(name, kind)

    def handle_value(self, column, data):
        if column == 5:
            self.current_item.speed = float(data)
        elif column == 6:
            self.current_item.dps = float(data)


def write_best_category(out, prefix, items, steps):
    """Write a "<prefix>.best_at_level" or "<prefix>.second_at_level" category.

    out    -- writable text stream
    prefix -- category name prefix
    items  -- iterable of items to include
    steps  -- 1 for the "best" category, 2 for "second"; the number of links
              to follow along each item's upgrade chain

    For each item the upgrade chain is followed *steps* times.  If an item is
    found there, the entry is limited to item levels from the item's own drop
    level up to one below that item's drop level; otherwise only a minimum
    item level is written.
    """
    best = ["best", "second"]

    out.write('category "{}.{}_at_level"\n'.format(prefix, best[steps-1]))
    out.write("{\n\tor\n\t{\n")
    for it in items:
        upgrade = it
        for _ in range(steps):
            upgrade = upgrade.upgrade
            if not upgrade:
                break

        out.write("\t\tand\n\t\t{\n")
        out.write('\t\t\tbase_type "{}";\n'.format(it.name))
        if upgrade:
            out.write("\t\t\titem_level {} {};\n".format(it.droplevel, upgrade.droplevel-1))
        else:
            out.write("\t\t\tmin_item_level {};\n".format(it.droplevel))
        out.write("\t\t};\n")
    out.write("\t};\n};\n")


def _fetch_page(url):
    """Download *url* and return its body as text; raise on HTTP errors."""
    import requests

    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page into empty output files.
    response.raise_for_status()
    # response.encoding is None when the server declares no charset.
    encoding = response.encoding or response.apparent_encoding or "utf-8"
    return codecs.decode(response.content, encoding)


def _write_class_category(out, name, headings):
    """Write category *name* matching any of the item classes in *headings*."""
    out.write('category "{}"\n{{\n\tor\n\t{{\n'.format(name))
    for h in headings:
        out.write('\t\tclass "{}";\n'.format(h))
    out.write("\t};\n};\n")


def _write_armor_categories(out, parser):
    """Write per-armor-type categories followed by the overall "armor" category."""
    for t, profile in ARMOR_TYPES.items():
        items = [i for i in parser.items.values() if i.defence_profile() == profile]

        for steps in range(1, 3):
            write_best_category(out, "armor.{}".format(t), items, steps)

        out.write('category "armor.{}"\n'.format(t))
        out.write("{\n\tor\n\t{\n")
        for it in items:
            out.write('\t\tbase_type "{}";\n'.format(it.name))
        out.write("\t};\n};\n")

    _write_class_category(out, "armor", parser.headings)


def _write_weapon_categories(out, parser):
    """Write per-weapon-class categories followed by the overall "weapon" category."""
    for h in parser.headings:
        items = [i for i in parser.items.values() if i.kind == h]

        for steps in range(1, 3):
            write_best_category(out, "weapon.{}".format(h.lower().replace(' ', '_')), items, steps)

    _write_class_category(out, "weapon", parser.headings)


def main():
    """Scrape both item-data pages and write armor.txt and weapons.txt."""
    p = ArmorDataParser()
    p.feed(_fetch_page(ARMOUR_URL))
    p.close()
    with open("armor.txt", "w") as out:
        _write_armor_categories(out, p)

    p = WeaponDataParser()
    p.feed(_fetch_page(WEAPON_URL))
    p.close()
    with open("weapons.txt", "w") as out:
        _write_weapon_categories(out, p)


if __name__ == "__main__":
    main()