git.tdb.fi Git - poefilter.git/commitdiff
Add another script to scrape more item data from the wiki
author Mikko Rasa <tdb@tdb.fi>
Fri, 17 Aug 2018 15:16:54 +0000 (18:16 +0300)
committer Mikko Rasa <tdb@tdb.fi>
Fri, 17 Aug 2018 15:16:54 +0000 (18:16 +0300)
scrape-wiki-data.py [new file with mode: 0755]

diff --git a/scrape-wiki-data.py b/scrape-wiki-data.py
new file mode 100755 (executable)
index 0000000..a141fea
--- /dev/null
@@ -0,0 +1,202 @@
+#!/usr/bin/python3
+
+import requests
+import html.parser
+import codecs
+
+class Card:
+       """A named card record with a reward category.
+
+       Instances are plain data holders: name is given at construction and
+       reward_kind is assigned later from outside the class.
+       """
+
+       def __init__(self, name):
+               # reward_kind stays None until some caller assigns a category string.
+               self.reward_kind = None
+               self.name = name
+
+class Flask:
+       """A named flask record with the stats used to rank it against others.
+
+       droplevel, amount and duration start at zero and upgrade starts as None;
+       all of them are filled in from outside the class after construction.
+       """
+
+       def __init__(self, name):
+               self.name = name
+               self.droplevel = 0
+               self.amount = 0.0
+               self.duration = 0.0
+               # Assigned externally: the Flask that supersedes this one, if any.
+               self.upgrade = None
+
+       def is_upgrade_for(self, other):
+               """Return True if this flask beats other in both rate and total amount.
+
+               The rate is amount divided by duration.  If either flask still has a
+               zero duration (it was never filled in), no rate can be computed, so
+               the result is False instead of a ZeroDivisionError.
+               """
+               if not self.duration or not other.duration:
+                       return False
+
+               aps = self.amount/self.duration
+               other_aps = other.amount/other.duration
+               return aps>other_aps and self.amount>other.amount
+
+class WikiParser(html.parser.HTMLParser):
+       """Base parser that collects items from the rows of "wikitable" tables.
+
+       The text of the first cell in a row is passed to create_item(); if that
+       returns a truthy object it is appended to items, and the text of each
+       later cell in the same row is passed to handle_value().  Text nested
+       inside a span whose class contains "c-item-hoverbox__display" is skipped.
+       Subclasses override create_item() and handle_value().
+       """
+
+       def __init__(self):
+               super(WikiParser, self).__init__()
+
+               self.in_items_table = False
+               self.column = 0
+               self.in_cell = False
+               # Nesting depth of spans inside an ignored hoverbox; 0 means not ignoring.
+               self.ignore_data = 0
+               self.items = []
+               self.current_item = None
+
+       @staticmethod
+       def _has_class(attrs, fragment):
+               """Return True if any class attribute in attrs contains fragment."""
+               # HTMLParser reports an attribute written without a value as
+               # (name, None); testing "in None" would raise TypeError.
+               return any(n=="class" and v is not None and fragment in v for n, v in attrs)
+
+       def handle_starttag(self, tag, attrs):
+               """Update table, row, cell and ignored-span state for an opening tag."""
+               if tag=="table":
+                       if self._has_class(attrs, "wikitable"):
+                               self.in_items_table = True
+               elif tag=="tr":
+                       # A new row starts over at the first column with no item yet.
+                       self.column = 0
+                       self.current_item = None
+               elif tag=="td":
+                       self.column += 1
+                       if self.in_items_table:
+                               self.in_cell = True
+               elif tag=="span":
+                       if self.ignore_data:
+                               # Count nested spans so only the matching end tag
+                               # finishes the ignored region.
+                               self.ignore_data += 1
+                       elif self._has_class(attrs, "c-item-hoverbox__display"):
+                               self.ignore_data = 1
+
+       def handle_endtag(self, tag):
+               """Undo the state set by the corresponding opening tag."""
+               if tag=="table":
+                       self.in_items_table = False
+               elif tag=="td":
+                       self.in_cell = False
+               elif tag=="span":
+                       if self.ignore_data:
+                               self.ignore_data -= 1
+
+       def handle_data(self, data):
+               """Route non-blank cell text to create_item() or handle_value()."""
+               if self.ignore_data:
+                       return
+
+               data = data.strip()
+               if not data:
+                       return
+
+               if self.in_cell:
+                       if self.column==1:
+                               self.current_item = self.create_item(data)
+                               if self.current_item:
+                                       self.items.append(self.current_item)
+                       elif self.current_item:
+                               self.handle_value(self.column, data)
+
+       def create_item(self, name):
+               """Return a new item for a first-column text, or None to skip the row."""
+               pass
+
+       def handle_value(self, column, data):
+               """Receive the text of a later cell (column is 1-based) for current_item."""
+               pass
+
+class DivinationCardsParser(WikiParser):
+       """Parser that builds Card items and assigns each one a reward_kind.
+
+       Only the third cell of a row is inspected: the class of a span found
+       there and the cell's text both contribute.  A row that ends without any
+       match gets the kind "other".
+       """
+
+       # Ordered (class-name fragments, reward kind) pairs; the first match wins.
+       _REWARD_CLASSES = (
+               (("-currency",), "currency"),
+               (("-unique",), "unique"),
+               (("-rare",), "rare"),
+               (("-magic", "-mod"), "magic"),
+               (("-gem",), "skillgem"),
+       )
+
+       def handle_starttag(self, tag, attrs):
+               """Derive reward_kind from the class of a span in the third cell."""
+               super(DivinationCardsParser, self).handle_starttag(tag, attrs)
+
+               if tag!="span" or self.ignore_data:
+                       return
+
+               card = self.current_item
+               if not (self.in_cell and card and not card.reward_kind and self.column==3):
+                       return
+
+               for n, v in attrs:
+                       # HTMLParser reports a valueless attribute as (name, None);
+                       # skip it rather than fail on the substring tests below.
+                       if n!="class" or v is None:
+                               continue
+                       for fragments, kind in self._REWARD_CLASSES:
+                               if any(f in v for f in fragments):
+                                       card.reward_kind = kind
+                                       break
+
+       def handle_endtag(self, tag):
+               """Fall back to the kind "other" when a row ends without a match."""
+               super(DivinationCardsParser, self).handle_endtag(tag)
+
+               if tag=="tr":
+                       if self.current_item and not self.current_item.reward_kind:
+                               self.current_item.reward_kind = "other"
+
+       def create_item(self, name):
+               """Create a Card for every first-column text."""
+               return Card(name)
+
+       def handle_value(self, column, data):
+               """Set reward_kind from third-cell text mentioning links or maps.
+
+               This assignment is unconditional, so it replaces a kind already
+               derived from a span class.
+               """
+               if column==3:
+                       if "-Link" in data:
+                               self.current_item.reward_kind = "links"
+                       elif "Map" in data:
+                               self.current_item.reward_kind = "map"
+
+class FlasksParser(WikiParser):
+       """Parser that builds Flask items and links each one to its upgrade.
+
+       With utility=False the cells after the name are read as drop level,
+       amount and duration; with utility=True they are read as drop level and
+       duration, and amount is left untouched.
+       """
+
+       def __init__(self, utility=False):
+               super(FlasksParser, self).__init__()
+
+               self.utility = utility
+
+       def handle_endtag(self, tag):
+               """At the end of a row, record the row's flask as an upgrade where it applies."""
+               super(FlasksParser, self).handle_endtag(tag)
+
+               newest = self.current_item
+               if tag!="tr" or not newest:
+                       return
+
+               for earlier in self.items:
+                       # Only the first upgrade found for a flask is kept.
+                       if earlier.upgrade:
+                               continue
+                       if newest.is_upgrade_for(earlier):
+                               earlier.upgrade = newest
+
+       def create_item(self, name):
+               """Return a Flask when name ends with "Flask"; otherwise None."""
+               return Flask(name) if name.endswith("Flask") else None
+
+       def handle_value(self, column, data):
+               """Store a numeric cell on the current flask according to its column."""
+               flask = self.current_item
+
+               if column==2:
+                       flask.droplevel = int(data)
+                       return
+
+               if self.utility:
+                       if column==3:
+                               flask.duration = float(data)
+                       return
+
+               if column==3:
+                       flask.amount = int(data)
+               elif column==4:
+                       flask.duration = float(data)
+
+def scrape_flasks(out, url, kind):
+       """Fetch one flask list page and write a "best_at_level" category to out.
+
+       out is a writable text stream, url is the page to download and kind is
+       inserted into the category name.  Each parsed flask gets an item level
+       range ending one below its upgrade's drop level, or an open-ended
+       minimum when it has no upgrade.
+
+       Raises requests.HTTPError if the server answers with an error status.
+       """
+       r = requests.get(url)
+       # Fail loudly on an HTTP error instead of parsing an error page into an
+       # empty category.
+       r.raise_for_status()
+       p = FlasksParser()
+       # r.encoding is None when the response declares no charset, and
+       # codecs.decode() rejects None, so fall back to UTF-8.
+       p.feed(codecs.decode(r.content, r.encoding or "utf-8"))
+
+       out.write('category "flask.{}.best_at_level"\n'.format(kind))
+       out.write('{\n\tclass "Flask";\n\tor\n\t{\n')
+       for it in p.items:
+               out.write("\t\tand\n\t\t{\n")
+               out.write('\t\t\tbase_type "{}";\n'.format(it.name))
+               if it.upgrade:
+                       out.write("\t\t\titem_level {} {};\n".format(it.droplevel, it.upgrade.droplevel-1))
+               else:
+                       out.write("\t\t\tmin_item_level {};\n".format(it.droplevel))
+               out.write("\t\t};\n")
+       out.write("\t};\n};\n")
+
+def _fetch_html(url):
+       """Download url and return its body decoded to text.
+
+       Raises requests.HTTPError if the server answers with an error status.
+       """
+       r = requests.get(url)
+       r.raise_for_status()
+       # r.encoding is None when the response declares no charset, and
+       # codecs.decode() rejects None, so fall back to UTF-8.
+       return codecs.decode(r.content, r.encoding or "utf-8")
+
+def main():
+       """Scrape card and flask data and write cards.txt and flasks.txt.
+
+       Both files are created relative to the current working directory and
+       are closed as soon as their contents have been written.
+       """
+       p = DivinationCardsParser()
+       p.feed(_fetch_html("https://pathofexile.gamepedia.com/List_of_divination_cards"))
+
+       # Group the cards by reward kind; each group becomes one category.
+       by_reward = {}
+       for it in p.items:
+               by_reward.setdefault(it.reward_kind, []).append(it)
+
+       with open("cards.txt", "w") as out:
+               for reward, cards in by_reward.items():
+                       out.write('category "card.{}"\n'.format(reward))
+                       out.write('{\n\tclass "Card";\n\tor\n\t{\n')
+                       for it in cards:
+                               out.write('\t\tbase_type "{}";\n'.format(it.name))
+                       out.write("\t};\n};\n")
+
+       with open("flasks.txt", "w") as out:
+               scrape_flasks(out, "https://pathofexile.gamepedia.com/Life_Flasks", "life")
+               scrape_flasks(out, "https://pathofexile.gamepedia.com/Mana_Flasks", "mana")
+               scrape_flasks(out, "https://pathofexile.gamepedia.com/Hybrid_Flasks", "hybrid")
+
+               # Both utility pages feed one parser so they end up in one category.
+               p = FlasksParser(True)
+               p.feed(_fetch_html("https://pathofexile.gamepedia.com/Utility_Flasks"))
+               p.feed(_fetch_html("https://pathofexile.gamepedia.com/Critical_Utility_Flasks"))
+
+               out.write('category "flask.utility"\n{\n\tclass "Flask";\n\tor\n\t{\n')
+               for it in p.items:
+                       out.write('\t\tbase_type "{}";\n'.format(it.name))
+               out.write("\t};\n};\n")
+
+# Run the scraper only when this file is executed as a script, not on import.
+if __name__=="__main__":
+       main()