]> git.tdb.fi Git - poefilter.git/blob - scrape-wiki-data.py
Some tweaks to skill gem scraping
[poefilter.git] / scrape-wiki-data.py
1 #!/usr/bin/python3
2
3 import sys
4 import requests
5 import html.parser
6 import codecs
7
class Card:
    """A named card plus the kind of reward assigned to it by a parser."""

    def __init__(self, name):
        # reward_kind stays None until something classifies the reward.
        self.reward_kind = None
        self.name = name
12
class Flask:
    """A flask base type with its drop level, amount and duration."""

    def __init__(self, name):
        self.name = name
        self.droplevel = 0
        self.amount = 0.0
        self.duration = 0.0
        # The Flask that supersedes this one, or None if none is known.
        self.upgrade = None

    def is_upgrade_for(self, other):
        """Return True if this flask beats *other* in both rate and amount.

        The rate is amount divided by duration.  A flask whose duration is
        zero (the default, i.e. never filled in) has no defined rate, so the
        comparison yields False instead of raising ZeroDivisionError.
        """
        if not self.duration or not other.duration:
            return False

        aps = self.amount/self.duration
        other_aps = other.amount/other.duration
        return (aps>other_aps and self.amount>other.amount)
25
class SkillGem:
    """Data gathered about one skill gem; every field but the name starts unset."""

    def __init__(self, name):
        self.name = name

        # Flags, both off until a parser decides otherwise.
        self.vaal = False
        self.drop_only = False

        # Values that are unknown until parsed.
        self.price = None
        self.primary_attribute = None
        self.secondary_attribute = None
34
class WikiParser(html.parser.HTMLParser):
    """HTML parser base that tracks whether parsing is inside an item hoverbox.

    ``ignore`` is the <span> nesting depth counted from a span whose class
    contains "c-item-hoverbox__display"; it is 0 outside of any such span.
    """

    def __init__(self):
        super().__init__()

        self.ignore = 0

    def handle_starttag(self, tag, attrs):
        """Update the hoverbox depth for an opening <span>."""
        if tag != "span":
            return

        if self.ignore:
            # Already inside a hoverbox: count nested spans so the matching
            # end tags can be paired up.
            self.ignore += 1
            return

        for n, v in attrs:
            # HTMLParser reports a valueless attribute as (name, None).
            if n == "class" and v and "c-item-hoverbox__display" in v:
                self.ignore = 1

    def handle_endtag(self, tag):
        """Leave one level of hoverbox nesting on a closing </span>."""
        if tag == "span" and self.ignore:
            self.ignore -= 1
54
class WikiTableParser(WikiParser):
    """Base for parsers that build one item per row of a "wikitable" table.

    Text found in the first <td> of a row is handed to create_item(); text in
    later cells is handed to handle_value() as long as the row produced an
    item.  Subclasses override those two hooks.
    """

    def __init__(self):
        super().__init__()

        self.items = []
        self.current_item = None
        self.in_items_table = False
        self.in_cell = False
        # Number of <td> tags opened so far in the current row.
        self.column = 0

    def handle_starttag(self, tag, attrs):
        super().handle_starttag(tag, attrs)

        if tag == "tr":
            # A new row starts with no item and no cells seen.
            self.column = 0
            self.current_item = None
        elif tag == "td":
            self.column += 1
            if self.in_items_table:
                self.in_cell = True
        elif tag == "table":
            for attr_name, attr_value in attrs:
                if attr_name == "class" and "wikitable" in attr_value:
                    self.in_items_table = True

    def handle_endtag(self, tag):
        super().handle_endtag(tag)

        if tag == "td":
            self.in_cell = False
        elif tag == "table":
            self.in_items_table = False

    def handle_data(self, data):
        # Text inside an item hoverbox is not part of the table contents.
        if self.ignore:
            return

        text = data.strip()
        if not text or not self.in_cell:
            return

        if self.column == 1:
            item = self.create_item(text)
            self.current_item = item
            if item:
                self.items.append(item)
        elif self.current_item:
            self.handle_value(self.column, text)

    def create_item(self, name):
        """Hook: return a new item for *name*, or None to skip the row."""
        return None

    def handle_value(self, column, data):
        """Hook: store the text of a later cell on self.current_item."""
        return None
109
class DivinationCardsParser(WikiTableParser):
    """Table parser that creates a Card per row and classifies its reward.

    The reward kind is taken from the class of the first qualifying <span>
    in the third cell, may be overridden by text in that cell, and falls
    back to "other" when the row ends without a classification.
    """

    # (substring of the span class, reward kind); the first match wins.
    _reward_classes = (
        ("-currency", "currency"),
        ("-unique", "unique"),
        ("-rare", "rare"),
        ("-magic", "magic"),
        ("-mod", "magic"),
        ("-gem", "skillgem"),
    )

    def handle_starttag(self, tag, attrs):
        super().handle_starttag(tag, attrs)

        if tag != "span" or self.ignore:
            return

        card = self.current_item
        if not (self.in_cell and card and self.column == 3):
            return
        if card.reward_kind:
            # Already classified earlier in this row.
            return

        for attr_name, attr_value in attrs:
            if attr_name != "class":
                continue
            for marker, kind in self._reward_classes:
                if marker in attr_value:
                    card.reward_kind = kind
                    break

    def handle_endtag(self, tag):
        super().handle_endtag(tag)

        if tag != "tr":
            return

        card = self.current_item
        if card and not card.reward_kind:
            card.reward_kind = "other"

    def create_item(self, name):
        return Card(name)

    def handle_value(self, column, data):
        if column != 3:
            return

        # Text in the reward cell takes precedence over the span class.
        if "-Link" in data:
            self.current_item.reward_kind = "links"
        elif "Map" in data:
            self.current_item.reward_kind = "map"
145
class FlasksParser(WikiTableParser):
    """Table parser that creates a Flask for each row whose name ends in "Flask".

    With ``utility`` set, the third column is read as the duration and no
    amount is read; otherwise the third column is the amount and the fourth
    the duration.
    """

    def __init__(self, utility=False):
        super().__init__()

        self.utility = utility

    def handle_endtag(self, tag):
        super().handle_endtag(tag)

        newest = self.current_item
        if tag != "tr" or not newest:
            return

        # Record the finished row as the upgrade of every collected flask it
        # improves on, unless that flask already has one.
        for older in self.items:
            if not older.upgrade and newest.is_upgrade_for(older):
                older.upgrade = newest

    def create_item(self, name):
        return Flask(name) if name.endswith("Flask") else None

    def handle_value(self, column, data):
        flask = self.current_item
        if column == 2:
            flask.droplevel = int(data)
        elif column == 3:
            if self.utility:
                flask.duration = float(data)
            else:
                flask.amount = int(data)
        elif column == 4 and not self.utility:
            flask.duration = float(data)
172
class SkillGemListParser(WikiParser):
    """Collects link targets that follow an <h2> heading whose text is "List".

    ``links`` receives the href of every <a> seen while the most recent
    heading text was "List" and the parser is not inside an item hoverbox.
    """

    def __init__(self):
        super().__init__()

        self.links = []
        self.in_list = False
        self.in_subheading = False

    def handle_starttag(self, tag, attrs):
        super().handle_starttag(tag, attrs)

        if tag == "h2":
            self.in_subheading = True
            return

        if tag == "a" and self.in_list and not self.ignore:
            self.links.extend(value for key, value in attrs if key == "href")

    def handle_endtag(self, tag):
        super().handle_endtag(tag)

        if tag == "h2":
            self.in_subheading = False

    def handle_data(self, data):
        if not self.in_subheading:
            return

        # Every piece of heading text re-evaluates whether we are in the list.
        self.in_list = (data.strip() == "List")
203
class SkillGemParser(WikiParser):
    """Parser for a single skill gem page.

    Text inside <h1> creates the SkillGem stored in ``item``; the rest of the
    page fills in its price, primary/secondary attribute and drop-only flag.
    ``item`` stays None when no heading text was seen, and every place that
    writes to it tolerates that instead of raising AttributeError.
    """

    # (price key stored on the gem, text matched in the purchase box)
    prices = (("alchemy", "Orb of Alchemy"),
        ("chance", "Orb of Chance"),
        ("alteration", "Orb of Alteration"),
        ("transmute", "Orb of Transmutation"),
        ("wisdom", "Scroll of Wisdom"))

    def __init__(self):
        super().__init__()

        self.item = None
        self.in_heading = False
        self.in_subheading = False
        # <span> nesting depth counted from the "item-box" span; 0 outside.
        self.in_infobox = 0
        self.in_box_heading = False
        self.in_purchase = False
        self.in_progression = False
        self.in_acquisition = False
        self.have_acquisition_data = False
        # Position within the current table, counted from 1.
        self.row = 0
        self.column = 0
        # [img alt text, value] pairs; the n-th entry is filled from column
        # 3+n of the second row of the progression table.
        self.attribute_reqs = []

    @staticmethod
    def _class_contains(attrs, needle):
        """Return True if a class attribute in *attrs* contains *needle*.

        Valueless attributes, which HTMLParser reports with a None value,
        never match.
        """
        return any(n == "class" and v and needle in v for n, v in attrs)

    def handle_starttag(self, tag, attrs):
        super().handle_starttag(tag, attrs)

        if tag == "h1":
            self.in_heading = True
        elif tag == "h2":
            self.in_subheading = True
        elif tag == "span":
            if self.in_infobox:
                self.in_infobox += 1
            elif self._class_contains(attrs, "item-box"):
                self.in_infobox = 1
        elif tag == "em":
            if self.in_infobox and self._class_contains(attrs, "header"):
                # A new box heading ends any purchase section in progress.
                self.in_purchase = False
                self.in_box_heading = True
        elif tag == "table":
            self.row = 0
        elif tag == "tr":
            self.row += 1
            self.column = 0
        elif tag == "td" or tag == "th":
            self.column += 1
        elif tag == "img":
            if self.in_progression and self.column >= 3:
                for n, v in attrs:
                    if n == "alt":
                        self.attribute_reqs.append([v, 0])
        elif tag == "div":
            at_footer = any(n == "id" and v == "footer" for n, v in attrs)
            # Reaching the footer without having seen any text under an
            # acquisition heading marks the gem as drop-only.
            if at_footer and self.item is not None and not self.have_acquisition_data:
                self.item.drop_only = True

    def handle_endtag(self, tag):
        super().handle_endtag(tag)

        if tag == "h1":
            self.in_heading = False
        elif tag == "h2":
            self.in_subheading = False
        elif tag == "span":
            if self.in_infobox:
                self.in_infobox -= 1
        elif tag == "em":
            self.in_box_heading = False
        elif tag == "tr":
            if (self.in_progression and self.row == 2 and self.attribute_reqs
                    and self.item is not None):
                # NOTE(review): the ascending sort makes the attribute with
                # the smallest row-2 value the primary one - confirm that
                # this ordering is intended.
                self.attribute_reqs.sort(key=lambda r: r[1])
                self.item.primary_attribute = self.attribute_reqs[0][0]
                if len(self.attribute_reqs) > 1:
                    self.item.secondary_attribute = self.attribute_reqs[1][0]

    def handle_data(self, data):
        data = data.strip()
        if not data:
            return

        if self.in_heading:
            # Anything from the first "(" onwards is not part of the name.
            name = data
            paren = name.find('(')
            if paren >= 0:
                name = name[:paren].strip()
            self.item = SkillGem(name)
            if name.startswith("Vaal"):
                self.item.vaal = True
        elif self.in_subheading:
            self.in_progression = ("progression" in data)
            self.in_acquisition = ("acquisition" in data)
        elif self.in_box_heading:
            if "Purchase" in data:
                self.in_purchase = True
        elif self.in_purchase:
            if self.column == 2 and self.item is not None:
                for p, n in SkillGemParser.prices:
                    if n == data:
                        self.item.price = p
        elif self.in_progression:
            if self.row == 2 and self.column >= 3 and self.column < 3+len(self.attribute_reqs):
                self.attribute_reqs[self.column-3][1] = int(data)
        elif self.in_acquisition:
            self.have_acquisition_data = True
314
def scrape_flasks(out, url, kind):
    """Fetch a flask table from *url* and write its best-at-level category.

    Each flask gets an item level range that ends one level below the drop
    level of its upgrade, or an open-ended minimum if it has no upgrade.
    *out* only needs a write() method; *kind* is inserted into the category
    name.
    """
    response = requests.get(url)
    parser = FlasksParser()
    parser.feed(codecs.decode(response.content, response.encoding))

    out.write('category "flask.{}.best_at_level"\n'.format(kind))
    out.write('{\n\tclass "Flask";\n\tor\n\t{\n')
    for flask in parser.items:
        if flask.upgrade:
            level_rule = "\t\t\titem_level {} {};\n".format(flask.droplevel, flask.upgrade.droplevel-1)
        else:
            level_rule = "\t\t\tmin_item_level {};\n".format(flask.droplevel)

        out.write("\t\tand\n\t\t{\n")
        out.write('\t\t\tbase_type "{}";\n'.format(flask.name))
        out.write(level_rule)
        out.write("\t\t};\n")
    out.write("\t};\n};\n")
331
def _write_category(out, category, item_class, names):
    """Write one category block matching any of *names* within *item_class*."""
    out.write('category "{}"\n'.format(category))
    out.write('{{\n\tclass "{}";\n\tor\n\t{{\n'.format(item_class))
    for name in names:
        out.write('\t\tbase_type "{}";\n'.format(name))
    out.write("\t};\n};\n")

def main():
    """Scrape cards, flasks and skill gems and write the category files.

    Produces cards.txt, flasks.txt and skillgems.txt in the current
    directory.  Each file is opened in a ``with`` block so it is flushed and
    closed even if a later request or parse step raises.
    """
    r = requests.get("https://pathofexile.gamepedia.com/List_of_divination_cards")
    p = DivinationCardsParser()
    p.feed(codecs.decode(r.content, r.encoding))

    # Group the cards by reward kind, keeping first-seen order of the kinds.
    by_reward = {}
    for it in p.items:
        by_reward.setdefault(it.reward_kind, []).append(it)

    with open("cards.txt", "w") as out:
        for reward, cards in by_reward.items():
            _write_category(out, "card.{}".format(reward), "Card",
                [it.name for it in cards])

    with open("flasks.txt", "w") as out:
        scrape_flasks(out, "https://pathofexile.gamepedia.com/Life_Flasks", "life")
        scrape_flasks(out, "https://pathofexile.gamepedia.com/Mana_Flasks", "mana")
        scrape_flasks(out, "https://pathofexile.gamepedia.com/Hybrid_Flasks", "hybrid")

        # Both utility pages are fed to one parser so they share a category.
        p = FlasksParser(True)
        r = requests.get("https://pathofexile.gamepedia.com/Utility_Flasks")
        p.feed(codecs.decode(r.content, r.encoding))
        r = requests.get("https://pathofexile.gamepedia.com/Critical_Utility_Flasks")
        p.feed(codecs.decode(r.content, r.encoding))

        _write_category(out, "flask.utility", "Flask", [it.name for it in p.items])

    r = requests.get("https://pathofexile.gamepedia.com/List_of_skill_gems")
    p = SkillGemListParser()
    p.feed(codecs.decode(r.content, r.encoding))

    gems = []

    # After the first line, each progress line erases the previous one.
    prefix = ""
    for l in p.links:
        r2 = requests.get("https://pathofexile.gamepedia.com"+l)
        p2 = SkillGemParser()
        p2.feed(codecs.decode(r2.content, r2.encoding))
        if p2.item is None:
            # The page had no heading text to build a gem from; skip it
            # rather than crash on a None item below.
            continue
        gems.append(p2.item)

        sys.stdout.write(prefix+"{}\n".format(p2.item.name))
        prefix = "\033[1A\033[K"

    sys.stdout.write(prefix)

    with open("skillgems.txt", "w") as out:
        _write_category(out, "skillgem.special.vaal", "Skill Gem",
            [g.name for g in gems if g.vaal])

        _write_category(out, "skillgem.special.drop", "Skill Gem",
            [g.name for g in gems if (g.drop_only or not g.price) and not g.vaal])

        for price, _ in SkillGemParser.prices:
            _write_category(out, "skillgem.price.{}".format(price), "Skill Gem",
                [g.name for g in gems if not g.drop_only and g.price==price])

        for a in ["strength", "dexterity", "intelligence"]:
            _write_category(out, "skillgem.attribute.{}".format(a), "Skill Gem",
                [g.name for g in gems if g.primary_attribute==a])

if __name__=="__main__":
    main()