git.tdb.fi Git - poefilter.git/blob - scrape-wiki-data.py
Minor tweaks to the item scraping script
[poefilter.git] / scrape-wiki-data.py
1 #!/usr/bin/python3
2
3 import requests
4 import html.parser
5 import codecs
6
class Card:
    """Plain record describing one card.

    Attributes:
        name: The value handed to the constructor.
        reward_kind: Starts as None; meant to be filled in later by
            whoever classifies the card.
    """

    def __init__(self, name):
        # The reward category is unknown at construction time.
        self.reward_kind = None
        self.name = name
11
class Flask:
    """Plain record describing one flask base type.

    Attributes:
        name: The value handed to the constructor.
        droplevel: Starts at 0 until assigned.
        amount: Starts at 0.0 until assigned.
        duration: Starts at 0.0 until assigned.
        upgrade: Another Flask considered a better replacement, or None.
    """

    def __init__(self, name):
        self.name = name
        self.droplevel = 0
        self.amount = 0.0
        self.duration = 0.0
        self.upgrade = None

    def is_upgrade_for(self, other):
        """Return True if this flask beats *other* on both rate and amount.

        The rate is ``amount / duration``. A flask counts as an upgrade only
        when both its rate and its total amount are strictly greater than
        those of *other*.

        If either flask still has a zero duration (for example because that
        value was never filled in), the rate is undefined, so the answer is
        False instead of a ZeroDivisionError.
        """
        if not self.duration or not other.duration:
            return False

        aps = self.amount / self.duration
        other_aps = other.amount / other.duration
        return aps > other_aps and self.amount > other.amount
24
class WikiParser(html.parser.HTMLParser):
    """Base HTML parser that collects one item per row of a "wikitable".

    The text of the first ``<td>`` of each row is passed to ``create_item``;
    if that returns a truthy object it becomes the current item and is
    appended to ``items``. Text in later cells of the same row is passed to
    ``handle_value`` together with its 1-based column number. Subclasses
    override ``create_item`` and ``handle_value``.

    Text nested inside a ``<span>`` whose class contains
    ``c-item-hoverbox__display`` is skipped entirely.
    """

    def __init__(self):
        super().__init__()

        self.in_items_table = False   # inside a <table class="...wikitable...">
        self.column = 0               # number of <td> tags seen in the current row
        self.in_cell = False          # inside a <td> of an items table
        self.ignore_data = 0          # nesting depth of <span>s within an ignored span
        self.items = []
        self.current_item = None

    @staticmethod
    def _has_class(attrs, needle):
        """Return True if any ``class`` attribute value contains *needle*.

        HTMLParser reports attributes written without a value (``<td class>``)
        as ``(name, None)``; those are skipped rather than raising TypeError.
        """
        return any(n == "class" and v and needle in v for n, v in attrs)

    def handle_starttag(self, tag, attrs):
        """Track table/row/cell position and the ignored-span nesting depth."""
        if tag == "table":
            if self._has_class(attrs, "wikitable"):
                self.in_items_table = True
        elif tag == "tr":
            # A new row starts: no columns seen yet and no item created yet.
            self.column = 0
            self.current_item = None
        elif tag == "td":
            self.column += 1
            if self.in_items_table:
                self.in_cell = True
        elif tag == "span":
            if self.ignore_data:
                # Already inside an ignored span; count nesting so that the
                # matching end tags bring the counter back to zero.
                self.ignore_data += 1
            elif self._has_class(attrs, "c-item-hoverbox__display"):
                self.ignore_data = 1

    def handle_endtag(self, tag):
        """Undo the state set by the corresponding start tag."""
        if tag == "table":
            self.in_items_table = False
        elif tag == "td":
            self.in_cell = False
        elif tag == "span":
            if self.ignore_data:
                self.ignore_data -= 1

    def handle_data(self, data):
        """Route non-blank cell text to ``create_item`` or ``handle_value``."""
        if self.ignore_data:
            return

        data = data.strip()
        if not data:
            return

        if self.in_cell:
            if self.column == 1:
                self.current_item = self.create_item(data)
                if self.current_item:
                    self.items.append(self.current_item)
            elif self.current_item:
                self.handle_value(self.column, data)

    def create_item(self, name):
        """Hook: return a new item for a row whose first cell reads *name*.

        The base implementation returns None, meaning the row is skipped.
        """
        return None

    def handle_value(self, column, data):
        """Hook: consume the text *data* found in cell number *column*.

        Only called while ``current_item`` is set. The base implementation
        does nothing.
        """
        return None
86
class DivinationCardsParser(WikiParser):
    """Parser that builds a Card per table row and classifies its reward.

    The reward kind is derived from the third column: first from the CSS
    class of a ``<span>`` found there, then possibly overridden by the cell
    text ("-Link" or "Map"). Rows that end without any classification get
    the kind "other".
    """

    # Ordered (substring of the span's class value, reward kind) pairs;
    # the first substring that matches wins.
    _REWARD_CLASS_MARKERS = (
        ("-currency", "currency"),
        ("-unique", "unique"),
        ("-rare", "rare"),
        ("-magic", "magic"),
        ("-mod", "magic"),
        ("-gem", "skillgem"),
    )

    def handle_starttag(self, tag, attrs):
        """Classify the current card from a span's class in column 3."""
        super(DivinationCardsParser, self).handle_starttag(tag, attrs)

        if tag != "span" or self.ignore_data:
            return

        card = self.current_item
        if not (self.in_cell and card and not card.reward_kind and self.column == 3):
            return

        for n, v in attrs:
            # HTMLParser yields None for an attribute written without a
            # value (``<span class>``); skip it instead of raising TypeError.
            if n != "class" or not v:
                continue
            for marker, kind in self._REWARD_CLASS_MARKERS:
                if marker in v:
                    card.reward_kind = kind
                    break

    def handle_endtag(self, tag):
        """At the end of a row, default an unclassified card to "other"."""
        super(DivinationCardsParser, self).handle_endtag(tag)

        if tag == "tr":
            if self.current_item and not self.current_item.reward_kind:
                self.current_item.reward_kind = "other"

    def create_item(self, name):
        """Create a Card named after the text of the row's first cell."""
        return Card(name)

    def handle_value(self, column, data):
        """Refine the reward kind from the text of column 3.

        Text containing "-Link" marks the reward as "links"; otherwise text
        containing "Map" marks it as "map". This overrides any kind already
        set from a span class.
        """
        if column == 3:
            if "-Link" in data:
                self.current_item.reward_kind = "links"
            elif "Map" in data:
                self.current_item.reward_kind = "map"
122
class FlasksParser(WikiParser):
    """Parser that builds a Flask per table row and links flasks to upgrades.

    With ``utility`` false, column 3 is read as the amount and column 4 as
    the duration; with ``utility`` true, column 3 is read as the duration
    and no amount is read. Column 2 is always the drop level.
    """

    def __init__(self, utility=False):
        super().__init__()

        self.utility = utility

    def handle_endtag(self, tag):
        """When a row ends, offer its flask as an upgrade to every flask
        collected so far that does not have one yet."""
        super().handle_endtag(tag)

        if tag != "tr" or not self.current_item:
            return

        candidate = self.current_item
        for flask in self.items:
            if flask.upgrade:
                continue
            if candidate.is_upgrade_for(flask):
                flask.upgrade = candidate

    def create_item(self, name):
        """Create a Flask for names ending in "Flask"; skip other rows."""
        if not name.endswith("Flask"):
            return None
        return Flask(name)

    def handle_value(self, column, data):
        """Store the numeric value found in *column* on the current flask."""
        flask = self.current_item

        if column == 2:
            flask.droplevel = int(data)
            return

        # The duration lives one column earlier on utility tables, which
        # have no amount column to read.
        duration_column = 3 if self.utility else 4
        if column == duration_column:
            flask.duration = float(data)
        elif column == 3:
            flask.amount = int(data)
149
def scrape_flasks(out, url, kind):
    """Download a flask table and write a "best at level" category for it.

    Args:
        out: Writable text stream that receives the category definition.
        url: Address of the page to fetch and parse with FlasksParser.
        kind: Name inserted into the category as ``flask.<kind>.best_at_level``.

    Each parsed flask gets one ``and`` clause. A flask with an upgrade is
    limited to item levels from its own drop level up to one below the
    upgrade's drop level; a flask without one only gets a minimum level.
    """
    response = requests.get(url)
    parser = FlasksParser()
    parser.feed(codecs.decode(response.content, response.encoding))

    out.write('category "flask.{}.best_at_level"\n'.format(kind))
    out.write('{\n\tclass "Flask";\n\tor\n\t{\n')

    for flask in parser.items:
        better = flask.upgrade
        if better:
            level_rule = "\t\t\titem_level {} {};\n".format(flask.droplevel, better.droplevel - 1)
        else:
            level_rule = "\t\t\tmin_item_level {};\n".format(flask.droplevel)

        out.write("\t\tand\n\t\t{\n")
        out.write('\t\t\tbase_type "{}";\n'.format(flask.name))
        out.write(level_rule)
        out.write("\t\t};\n")

    out.write("\t};\n};\n")
166
def main():
    """Scrape the wiki and write ``cards.txt`` and ``flasks.txt``.

    ``cards.txt`` receives one ``card.<reward kind>`` category per reward
    kind found on the divination card list. ``flasks.txt`` receives the
    life, mana and hybrid "best at level" categories followed by a single
    ``flask.utility`` category built from both utility flask pages.

    Both output files are opened in ``with`` blocks so they are flushed and
    closed even if a download or parse step fails part-way through.
    """
    r = requests.get("https://pathofexile.gamepedia.com/List_of_divination_cards")
    p = DivinationCardsParser()
    p.feed(codecs.decode(r.content, r.encoding))

    # Group the cards by reward kind, keeping first-seen order of the kinds.
    by_reward = {}
    for it in p.items:
        by_reward.setdefault(it.reward_kind, []).append(it)

    with open("cards.txt", "w") as out:
        for reward, cards in by_reward.items():
            out.write('category "card.{}"\n'.format(reward))
            out.write('{\n\tclass "Card";\n\tor\n\t{\n')
            for it in cards:
                out.write('\t\tbase_type "{}";\n'.format(it.name))
            out.write("\t};\n};\n")

    with open("flasks.txt", "w") as out:
        scrape_flasks(out, "https://pathofexile.gamepedia.com/Life_Flasks", "life")
        scrape_flasks(out, "https://pathofexile.gamepedia.com/Mana_Flasks", "mana")
        scrape_flasks(out, "https://pathofexile.gamepedia.com/Hybrid_Flasks", "hybrid")

        # One parser is fed both utility pages so that their flasks end up
        # in a single combined list.
        p = FlasksParser(True)
        r = requests.get("https://pathofexile.gamepedia.com/Utility_Flasks")
        p.feed(codecs.decode(r.content, r.encoding))
        r = requests.get("https://pathofexile.gamepedia.com/Critical_Utility_Flasks")
        p.feed(codecs.decode(r.content, r.encoding))

        out.write('category "flask.utility"\n{\n\tclass "Flask";\n\tor\n\t{\n')
        for it in p.items:
            out.write('\t\tbase_type "{}";\n'.format(it.name))
        out.write("\t};\n};\n")
200
# Run the scraper only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()