#!/usr/bin/python3
"""Scrape item data from the Path of Exile wiki into filter category files.

Running the script downloads the divination card and flask list pages from
pathofexile.gamepedia.com, parses the tables marked with the "wikitable"
class and writes ``cards.txt`` and ``flasks.txt`` to the current directory.
"""

# Provenance: added as scrape-wiki-data.py (new file, mode 100755) by commit
# f3e6c70bc6dd25a1bf3f2924c92ba6ffdc49fea5 of poefilter.git,
# "Add another script to scrape more item data from the wiki",
# Mikko Rasa, Fri, 17 Aug 2018 15:16:54 +0000 (+0300).
# http://git.tdb.fi/?p=poefilter.git;a=commitdiff_plain;h=f3e6c70bc6dd25a1bf3f2924c92ba6ffdc49fea5

import codecs
import html.parser


class Card:
    """A divination card: its name and the kind of reward it gives."""

    def __init__(self, name):
        self.name = name
        # Set by DivinationCardsParser; stays None until a reward is recognised.
        self.reward_kind = None


class Flask:
    """A flask base type plus the numbers read from its table row."""

    def __init__(self, name):
        self.name = name
        self.droplevel = 0
        self.amount = 0.0
        self.duration = 0.0
        # A later-parsed flask that is strictly better than this one, if any.
        self.upgrade = None

    def is_upgrade_for(self, other):
        """Return True if this flask beats *other* in both rate and amount.

        The rate is ``amount / duration``.  A flask whose duration is still
        zero (its row had no duration cell) cannot be compared, so the answer
        is False instead of a ZeroDivisionError.
        """
        if not self.duration or not other.duration:
            return False
        rate = self.amount / self.duration
        other_rate = other.amount / other.duration
        return rate > other_rate and self.amount > other.amount


def _class_contains(attrs, fragment):
    """Return True if any ``class`` attribute in *attrs* contains *fragment*."""
    # HTMLParser reports a valueless attribute as (name, None); skip those.
    return any(
        name == "class" and value and fragment in value
        for name, value in attrs
    )


class WikiParser(html.parser.HTMLParser):
    """Base parser that walks the rows of "wikitable" tables.

    Text found in the first ``<td>`` of a row is passed to ``create_item``;
    text in the following cells of that row is passed to ``handle_value``
    together with the 1-based column number.  Subclasses override both hooks.
    Created items are collected in ``self.items``.
    """

    def __init__(self):
        super().__init__()

        self.in_items_table = False
        self.column = 0
        self.in_cell = False
        # Nesting depth of <span>s inside an item hover box; text is skipped
        # while this is non-zero.
        self.ignore_data = 0
        self.items = []
        self.current_item = None

    def handle_starttag(self, tag, attrs):
        """Track table, row, cell and hover-box state for an opening tag."""
        if tag == "table":
            if _class_contains(attrs, "wikitable"):
                self.in_items_table = True
        elif tag == "tr":
            # A new row starts a new item.
            self.column = 0
            self.current_item = None
        elif tag == "td":
            self.column += 1
            if self.in_items_table:
                self.in_cell = True
        elif tag == "span":
            if self.ignore_data:
                self.ignore_data += 1
            elif _class_contains(attrs, "c-item-hoverbox__display"):
                self.ignore_data = 1

    def handle_endtag(self, tag):
        """Undo the state set up by the matching opening tag."""
        if tag == "table":
            self.in_items_table = False
        elif tag == "td":
            self.in_cell = False
        elif tag == "span":
            if self.ignore_data:
                self.ignore_data -= 1

    def handle_data(self, data):
        """Route non-blank cell text to ``create_item`` or ``handle_value``."""
        if self.ignore_data:
            return

        data = data.strip()
        if not data or not self.in_cell:
            return

        if self.column == 1:
            self.current_item = self.create_item(data)
            if self.current_item:
                self.items.append(self.current_item)
        elif self.current_item:
            self.handle_value(self.column, data)

    def create_item(self, name):
        """Return a new item for *name*, or None to skip the row."""
        pass

    def handle_value(self, column, data):
        """Store the text *data* of cell *column* on ``self.current_item``."""
        pass


class DivinationCardsParser(WikiParser):
    """Collects ``Card`` items and classifies each by its reward (column 3)."""

    # (class-name fragment, reward kind), checked in this order.
    REWARD_CLASSES = (
        ("-currency", "currency"),
        ("-unique", "unique"),
        ("-rare", "rare"),
        ("-magic", "magic"),
        ("-mod", "magic"),
        ("-gem", "skillgem"),
    )

    def handle_starttag(self, tag, attrs):
        """Derive the reward kind from the class of a span in column 3."""
        super().handle_starttag(tag, attrs)

        if tag != "span" or self.ignore_data:
            return
        card = self.current_item
        if not (self.in_cell and card and not card.reward_kind
                and self.column == 3):
            return

        for name, value in attrs:
            if name != "class" or not value:
                continue
            for fragment, kind in self.REWARD_CLASSES:
                if fragment in value:
                    card.reward_kind = kind
                    break

    def handle_endtag(self, tag):
        """At the end of a row, label a still-unclassified card "other"."""
        super().handle_endtag(tag)

        if tag == "tr":
            if self.current_item and not self.current_item.reward_kind:
                self.current_item.reward_kind = "other"

    def create_item(self, name):
        """Every first-column text becomes a card."""
        return Card(name)

    def handle_value(self, column, data):
        """Let reward text mentioning links or maps override the span class."""
        if column == 3:
            if "-Link" in data:
                self.current_item.reward_kind = "links"
            elif "Map" in data:
                self.current_item.reward_kind = "map"


class FlasksParser(WikiParser):
    """Collects ``Flask`` items and links each to its first upgrade.

    With ``utility=False`` the columns read are 2 (drop level), 3 (amount)
    and 4 (duration); with ``utility=True`` column 3 is the duration and no
    amount is read.
    """

    def __init__(self, utility=False):
        super().__init__()

        self.utility = utility

    def handle_endtag(self, tag):
        """At the end of a row, record the new flask as an upgrade where due."""
        super().handle_endtag(tag)

        if tag != "tr" or not self.current_item:
            return
        newest = self.current_item
        for flask in self.items:
            # Only the first upgrade found for a flask is kept.
            if not flask.upgrade and newest.is_upgrade_for(flask):
                flask.upgrade = newest

    def create_item(self, name):
        """Return a Flask for names ending in "Flask"; skip other rows."""
        if name.endswith("Flask"):
            return Flask(name)
        return None

    def handle_value(self, column, data):
        """Parse the numeric cell *data* into the matching Flask attribute."""
        if column == 2:
            self.current_item.droplevel = int(data)
        elif column == 3 and not self.utility:
            self.current_item.amount = int(data)
        elif (column == 4 and not self.utility) or (column == 3 and self.utility):
            self.current_item.duration = float(data)


def _fetch_html(url):
    """Download *url* and return the response body decoded to text.

    Raises requests.HTTPError for a 4xx/5xx response so that an error page
    is never parsed into an empty category file.
    """
    # Imported here because requests is the script's only third-party
    # dependency; the parser classes stay importable without it.
    import requests

    response = requests.get(url)
    response.raise_for_status()
    # response.encoding is None when the server declares no charset.
    return codecs.decode(response.content, response.encoding or "utf-8")


def _write_name_category(out, category, item_class, items):
    """Write a category that matches any of *items* by base type."""
    out.write('category "{}"\n'.format(category))
    out.write('{{\n\tclass "{}";\n\tor\n\t{{\n'.format(item_class))
    for item in items:
        out.write('\t\tbase_type "{}";\n'.format(item.name))
    out.write("\t};\n};\n")


def _write_flask_levels(out, kind, flasks):
    """Write the "best at level" category for *flasks*.

    A flask with an upgrade matches item levels from its own drop level up
    to one below the upgrade's drop level; one without an upgrade matches
    from its drop level upwards.
    """
    out.write('category "flask.{}.best_at_level"\n'.format(kind))
    out.write('{\n\tclass "Flask";\n\tor\n\t{\n')
    for flask in flasks:
        out.write("\t\tand\n\t\t{\n")
        out.write('\t\t\tbase_type "{}";\n'.format(flask.name))
        if flask.upgrade:
            out.write("\t\t\titem_level {} {};\n".format(
                flask.droplevel, flask.upgrade.droplevel - 1))
        else:
            out.write("\t\t\tmin_item_level {};\n".format(flask.droplevel))
        out.write("\t\t};\n")
    out.write("\t};\n};\n")


def scrape_flasks(out, url, kind):
    """Scrape the flask table at *url* and write its category to *out*.

    *out* is a writable text file object and *kind* is the name inserted
    into the category ("flask.<kind>.best_at_level").
    """
    parser = FlasksParser()
    parser.feed(_fetch_html(url))
    _write_flask_levels(out, kind, parser.items)


def main():
    """Fetch the wiki pages and write ``cards.txt`` and ``flasks.txt``."""
    cards = DivinationCardsParser()
    cards.feed(_fetch_html(
        "https://pathofexile.gamepedia.com/List_of_divination_cards"))

    by_reward = {}
    for card in cards.items:
        by_reward.setdefault(card.reward_kind, []).append(card)

    with open("cards.txt", "w", encoding="utf-8") as out:
        for reward, group in by_reward.items():
            _write_name_category(out, "card.{}".format(reward), "Card", group)

    with open("flasks.txt", "w", encoding="utf-8") as out:
        scrape_flasks(out, "https://pathofexile.gamepedia.com/Life_Flasks", "life")
        scrape_flasks(out, "https://pathofexile.gamepedia.com/Mana_Flasks", "mana")
        scrape_flasks(out, "https://pathofexile.gamepedia.com/Hybrid_Flasks", "hybrid")

        # Both utility pages feed one parser so they end up in one category.
        utility = FlasksParser(utility=True)
        utility.feed(_fetch_html(
            "https://pathofexile.gamepedia.com/Utility_Flasks"))
        utility.feed(_fetch_html(
            "https://pathofexile.gamepedia.com/Critical_Utility_Flasks"))
        _write_name_category(out, "flask.utility", "Flask", utility.items)


if __name__ == "__main__":
    main()