git.tdb.fi Git - poefilter.git/blob - scrape-wiki-data.py
Correctly merge icon and light beam appearances
[poefilter.git] / scrape-wiki-data.py
1 #!/usr/bin/python3
2
3 import sys
4 import requests
5 import html.parser
6 import codecs
7
class Card:
        """A divination card identified by name.

        reward_kind starts out as None and is assigned later by
        DivinationCardsParser while it reads the card's table row.
        """

        def __init__(self, name):
                self.name, self.reward_kind = name, None
12
class Flask:
        """A flask base type with the stats used to rank it against others.

        All numeric fields start at zero and upgrade starts as None; the table
        parser overwrites them as it reads the flask's row.
        """

        def __init__(self, name):
                self.name = name
                self.droplevel = 0
                self.amount = 0.0
                self.duration = 0.0
                # The Flask that supersedes this one, once one has been found.
                self.upgrade = None

        def is_upgrade_for(self, other):
                """Return True if this flask beats *other* in both rate and total.

                The rate is amount divided by duration.  A flask only counts as
                an upgrade when its rate and its total amount are both strictly
                greater than those of *other*.

                If either flask has a zero duration (the default until a duration
                has been parsed) no rate can be computed, so the result is False
                instead of a ZeroDivisionError.
                """
                if not self.duration or not other.duration:
                        return False

                aps = self.amount/self.duration
                other_aps = other.amount/other.duration
                return (aps>other_aps and self.amount>other.amount)
25
class SkillGem:
        """Facts about one skill gem, filled in while its wiki page is parsed.

        Everything except the name starts at a neutral default: the attributes
        and the price are None, and both flags are False.
        """

        def __init__(self, name):
                self.name = name
                # Flags first, then the values that are discovered later.
                self.vaal = self.drop_only = False
                self.price = None
                self.primary_attribute = None
                self.secondary_attribute = None
34
class WikiParser(html.parser.HTMLParser):
        """HTML parser base that tracks when it is inside an item hover box.

        self.ignore is the <span> nesting depth counted from the outermost span
        whose class contains "c-item-hoverbox__display"; it is zero outside of
        such a span.  Subclasses consult it to skip hover box content.
        """

        def __init__(self):
                super().__init__()

                self.ignore = 0

        def handle_starttag(self, tag, attrs):
                if tag != "span":
                        return

                if self.ignore:
                        # Already inside a hover box: just go one level deeper.
                        self.ignore += 1
                        return

                for attr_name, attr_value in attrs:
                        if attr_name == "class" and "c-item-hoverbox__display" in attr_value:
                                self.ignore = 1

        def handle_endtag(self, tag):
                if tag == "span" and self.ignore:
                        self.ignore -= 1
54
class WikiTableParser(WikiParser):
        """Base parser that turns the rows of a "wikitable" into items.

        Text found in the first <td> of a row is handed to create_item(); text
        in later cells of the same row is handed to handle_value() as long as an
        item exists for the row.  Created items are collected in self.items.
        Subclasses override create_item() and handle_value().
        """

        def __init__(self):
                super().__init__()

                self.items = []
                self.current_item = None
                self.in_items_table = False
                self.in_cell = False
                # 1-based index of the <td> being read within the current row.
                self.column = 0

        def handle_starttag(self, tag, attrs):
                super().handle_starttag(tag, attrs)

                if tag == "td":
                        self.column += 1
                        if self.in_items_table:
                                self.in_cell = True
                elif tag == "tr":
                        # A new row starts counting cells afresh and has no item yet.
                        self.column = 0
                        self.current_item = None
                elif tag == "table":
                        for attr_name, attr_value in attrs:
                                if attr_name == "class" and "wikitable" in attr_value:
                                        self.in_items_table = True

        def handle_endtag(self, tag):
                super().handle_endtag(tag)

                if tag == "td":
                        self.in_cell = False
                elif tag == "table":
                        self.in_items_table = False

        def handle_data(self, data):
                # Hover box content is never part of the table data.
                if self.ignore:
                        return

                text = data.strip()
                if not text or not self.in_cell:
                        return

                if self.column != 1:
                        if self.current_item:
                                self.handle_value(self.column, text)
                        return

                self.current_item = self.create_item(text)
                if self.current_item:
                        self.items.append(self.current_item)

        def create_item(self, name):
                """Build the item for a row from its first-cell text; None skips it."""
                return None

        def handle_value(self, column, data):
                """Receive the text of a later cell for self.current_item."""
                return None
109
class DivinationCardsParser(WikiTableParser):
        """Parses the divination card table and classifies each card's reward.

        The reward kind is derived from the third column: from the CSS class of
        the first matching <span> there, or from the cell text for link and map
        rewards.  Rows that end without a kind are classified as "other".
        """

        reward_kinds = ("currency", "map", "unique", "links", "skillgem", "rare", "magic", "other")

        # Class-name fragments checked in order; the first one found decides.
        _reward_class_markers = (
                ("-currency", "currency"),
                ("-unique", "unique"),
                ("-rare", "rare"),
                ("-magic", "magic"),
                ("-mod", "magic"),
                ("-gem", "skillgem"),
        )

        def handle_starttag(self, tag, attrs):
                super().handle_starttag(tag, attrs)

                if tag != "span" or self.ignore:
                        return

                card = self.current_item
                if not (self.in_cell and card and not card.reward_kind and self.column == 3):
                        return

                for attr_name, attr_value in attrs:
                        if attr_name != "class":
                                continue
                        for marker, kind in self._reward_class_markers:
                                if marker in attr_value:
                                        card.reward_kind = kind
                                        break

        def handle_endtag(self, tag):
                super().handle_endtag(tag)

                if tag != "tr":
                        return

                # Nothing recognisable was found in the row: fall back to "other".
                card = self.current_item
                if card and not card.reward_kind:
                        card.reward_kind = "other"

        def create_item(self, name):
                return Card(name)

        def handle_value(self, column, data):
                if column != 3:
                        return

                if "-Link" in data:
                        self.current_item.reward_kind = "links"
                elif "Map" in data:
                        self.current_item.reward_kind = "map"
147
class FlasksParser(WikiTableParser):
        """Parses a flask table into Flask objects and links each to its upgrade.

        With utility=False the columns are read as drop level (2), amount (3)
        and duration (4).  With utility=True there is no amount column and the
        duration is read from column 3.
        """

        def __init__(self, utility=False):
                super().__init__()

                self.utility = utility

        def handle_endtag(self, tag):
                super().handle_endtag(tag)

                if tag != "tr" or not self.current_item:
                        return

                # The row just finished becomes the upgrade of every known flask
                # that has none yet and that it improves upon.
                newest = self.current_item
                for known in self.items:
                        if not known.upgrade and newest.is_upgrade_for(known):
                                known.upgrade = newest

        def create_item(self, name):
                # Only first-cell text ending in "Flask" starts an item.
                return Flask(name) if name.endswith("Flask") else None

        def handle_value(self, column, data):
                flask = self.current_item
                if column == 2:
                        flask.droplevel = int(data)
                        return

                duration_column = 3 if self.utility else 4
                if column == duration_column:
                        flask.duration = float(data)
                elif column == 3:
                        # Only reachable for non-utility tables, where 3 is the amount.
                        flask.amount = int(data)
174
class SkillGemListParser(WikiParser):
        """Collects link targets from the "List" section of the skill gem page.

        Text inside an <h2> switches collection on when it equals "List" and off
        otherwise.  While collection is on, the href of every <a> outside a
        hover box is appended to self.links.
        """

        def __init__(self):
                super().__init__()

                self.links = []
                self.in_list = False
                self.in_subheading = False

        def handle_starttag(self, tag, attrs):
                super().handle_starttag(tag, attrs)

                if tag == "h2":
                        self.in_subheading = True
                        return

                if tag == "a" and self.in_list and not self.ignore:
                        self.links.extend(value for key, value in attrs if key == "href")

        def handle_endtag(self, tag):
                super().handle_endtag(tag)

                if tag == "h2":
                        self.in_subheading = False

        def handle_data(self, data):
                if not self.in_subheading:
                        return

                # Every piece of heading text re-evaluates the section.
                self.in_list = (data.strip() == "List")
205
class SkillGemParser(WikiParser):
        """Parses a single skill gem page into self.item (a SkillGem).

        The gem is created from the <h1> text.  The purchase price comes from
        the item box, the primary and secondary attributes from the second row
        of the progression table, and drop_only is set when the footer is
        reached without any text having been seen in the acquisition section.
        """

        # (short price key, currency name as it appears on the page)
        prices = (("alchemy", "Orb of Alchemy"),
                ("chance", "Orb of Chance"),
                ("alteration", "Orb of Alteration"),
                ("transmute", "Orb of Transmutation"),
                ("wisdom", "Scroll of Wisdom"))

        def __init__(self):
                super().__init__()

                self.item = None
                # [attribute name, required value] pairs, one per attribute icon.
                self.attribute_reqs = []
                self.row = 0
                self.column = 0
                # <span> nesting depth inside the item box; zero when outside.
                self.in_infobox = 0
                self.in_heading = False
                self.in_subheading = False
                self.in_box_heading = False
                self.in_purchase = False
                self.in_progression = False
                self.in_acquisition = False
                self.have_acquisition_data = False

        def handle_starttag(self, tag, attrs):
                super().handle_starttag(tag, attrs)

                if tag == "h1":
                        self.in_heading = True
                elif tag == "h2":
                        self.in_subheading = True
                elif tag == "table":
                        self.row = 0
                elif tag == "tr":
                        self.row += 1
                        self.column = 0
                elif tag in ("td", "th"):
                        self.column += 1
                elif tag == "span":
                        if self.in_infobox:
                                self.in_infobox += 1
                        else:
                                for key, value in attrs:
                                        if key == "class" and "item-box" in value:
                                                self.in_infobox = 1
                elif tag == "em":
                        if self.in_infobox:
                                for key, value in attrs:
                                        if key == "class" and "header" in value:
                                                # A new box heading ends any purchase section.
                                                self.in_purchase = False
                                                self.in_box_heading = True
                elif tag == "img":
                        if self.in_progression and self.column >= 3:
                                for key, value in attrs:
                                        if key == "alt":
                                                self.attribute_reqs.append([value, 0])
                elif tag == "div":
                        for key, value in attrs:
                                if key == "id" and value == "footer" and not self.have_acquisition_data:
                                        self.item.drop_only = True

        def handle_endtag(self, tag):
                super().handle_endtag(tag)

                if tag == "h1":
                        self.in_heading = False
                elif tag == "h2":
                        self.in_subheading = False
                elif tag == "em":
                        self.in_box_heading = False
                elif tag == "span":
                        if self.in_infobox:
                                self.in_infobox -= 1
                elif tag == "tr":
                        if not (self.in_progression and self.row == 2 and self.attribute_reqs):
                                return

                        # Sort in place, lowest requirement first; the first two
                        # entries become the primary and secondary attributes.
                        reqs = self.attribute_reqs
                        reqs.sort(key=lambda req: req[1])
                        self.item.primary_attribute = reqs[0][0]
                        if len(reqs) > 1:
                                self.item.secondary_attribute = reqs[1][0]

        def handle_data(self, data):
                text = data.strip()
                if not text:
                        return

                # The states are checked in priority order; only the first active
                # one gets to see the text.
                if self.in_heading:
                        # Drop anything from the first "(" onwards.
                        name = text.partition("(")[0].strip()
                        gem = SkillGem(name)
                        gem.vaal = name.startswith("Vaal")
                        self.item = gem
                        return

                if self.in_subheading:
                        self.in_progression = ("progression" in text)
                        self.in_acquisition = ("acquisition" in text)
                        return

                if self.in_box_heading:
                        if "Purchase" in text:
                                self.in_purchase = True
                        return

                if self.in_purchase:
                        if self.column == 2:
                                for price_key, currency_name in SkillGemParser.prices:
                                        if currency_name == text:
                                                self.item.price = price_key
                        return

                if self.in_progression:
                        req_index = self.column - 3
                        if self.row == 2 and 0 <= req_index < len(self.attribute_reqs):
                                self.attribute_reqs[req_index][1] = int(text)
                        return

                if self.in_acquisition:
                        self.have_acquisition_data = True
316
def scrape_flasks(out, url, kind):
        """Fetch one flask table and write its "best at level" category to *out*.

        out  -- object with a write() method that receives the category text
        url  -- address of the wiki page holding the flask table
        kind -- flask kind inserted into the category name ("flask.<kind>....")

        Each flask is emitted with an item level range that ends one level below
        the drop level of its upgrade, or with only a minimum item level when it
        has no upgrade.
        """
        r = requests.get(url)
        p = FlasksParser()
        # requests reports encoding as None when the response declares no
        # charset; codecs.decode() rejects None, so fall back to UTF-8.
        p.feed(codecs.decode(r.content, r.encoding or "utf-8"))

        out.write('category "flask.{}.best_at_level"\n'.format(kind))
        out.write('{\n\tclass "Flask";\n\tor\n\t{\n')
        for it in p.items:
                out.write("\t\tand\n\t\t{\n")
                out.write('\t\t\tbase_type "{}";\n'.format(it.name))
                if it.upgrade:
                        out.write("\t\t\titem_level {} {};\n".format(it.droplevel, it.upgrade.droplevel-1))
                else:
                        out.write("\t\t\tmin_item_level {};\n".format(it.droplevel))
                out.write("\t\t};\n")
        out.write("\t};\n};\n")
333
def _fetch_page(url):
        """Download *url* and return its body decoded to text."""
        response = requests.get(url)
        # requests reports encoding as None when the response declares no
        # charset; codecs.decode() rejects None, so fall back to UTF-8.
        return codecs.decode(response.content, response.encoding or "utf-8")


def _write_category(out, category, item_class, names):
        """Write one category that matches *item_class* items with any of *names*."""
        out.write('category "{}"\n'.format(category))
        out.write('{{\n\tclass "{}";\n\tor\n\t{{\n'.format(item_class))
        for name in names:
                out.write('\t\tbase_type "{}";\n'.format(name))
        out.write("\t};\n};\n")


def main():
        """Scrape the wiki and write cards.txt, flasks.txt and skillgems.txt.

        The files are created in the current directory.  While the individual
        skill gem pages are downloaded, the name of the most recent gem is shown
        on standard output as a single, continuously overwritten line.
        """
        base_url = "https://pathofexile.gamepedia.com"

        cards_parser = DivinationCardsParser()
        cards_parser.feed(_fetch_page(base_url+"/List_of_divination_cards"))

        by_reward = {}
        for card in cards_parser.items:
                by_reward.setdefault(card.reward_kind, []).append(card)

        with open("cards.txt", "w") as out:
                for kind in DivinationCardsParser.reward_kinds:
                        # A kind without any cards still gets an (empty) category
                        # instead of aborting the run with a KeyError.
                        names = [card.name for card in by_reward.get(kind, [])]
                        _write_category(out, "card.{}".format(kind), "Card", names)

        with open("flasks.txt", "w") as out:
                scrape_flasks(out, base_url+"/Life_Flasks", "life")
                scrape_flasks(out, base_url+"/Mana_Flasks", "mana")
                scrape_flasks(out, base_url+"/Hybrid_Flasks", "hybrid")

                # Both utility pages feed the same parser so they end up in one list.
                utility_parser = FlasksParser(True)
                utility_parser.feed(_fetch_page(base_url+"/Utility_Flasks"))
                utility_parser.feed(_fetch_page(base_url+"/Critical_Utility_Flasks"))
                _write_category(out, "flask.utility", "Flask", [flask.name for flask in utility_parser.items])

        list_parser = SkillGemListParser()
        list_parser.feed(_fetch_page(base_url+"/List_of_skill_gems"))

        gems = []

        prefix = ""
        for link in list_parser.links:
                gem_parser = SkillGemParser()
                gem_parser.feed(_fetch_page(base_url+link))
                gems.append(gem_parser.item)

                sys.stdout.write(prefix+"{}\n".format(gem_parser.item.name))
                # ANSI: move the cursor up one line and erase it, so the next
                # name replaces this one.
                prefix = "\033[1A\033[K"

        sys.stdout.write(prefix)

        with open("skillgems.txt", "w") as out:
                _write_category(out, "skillgem.special.vaal", "Skill Gem",
                        [g.name for g in gems if g.vaal])

                _write_category(out, "skillgem.special.drop", "Skill Gem",
                        [g.name for g in gems if (g.drop_only or not g.price) and not g.vaal])

                for price_key, _currency_name in SkillGemParser.prices:
                        _write_category(out, "skillgem.price.{}".format(price_key), "Skill Gem",
                                [g.name for g in gems if not g.drop_only and g.price==price_key])

                for attribute in ["strength", "dexterity", "intelligence"]:
                        _write_category(out, "skillgem.attribute.{}".format(attribute), "Skill Gem",
                                [g.name for g in gems if g.primary_attribute==attribute])
414
415 if __name__=="__main__":
416         main()