This repository was archived by the owner on Sep 16, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy pathmain.py
More file actions
83 lines (65 loc) · 2.44 KB
/
main.py
File metadata and controls
83 lines (65 loc) · 2.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from log import info,warning,error,success
import trio
import asks
# Configure the asks HTTP client to run on the trio event loop.
asks.init('trio')
class spider(object):
    """Crawler for the user-agent listings on developers.whatismybrowser.com.

    The methods rely on module-level names that this class does not define:
    ``spiderSession``, ``jq``, ``urljoin``, ``math``, ``loger``, ``colored``,
    ``POOLS``, ``TASKS``, ``PERPAGE``, ``LIMIT``, ``MAXNUMS`` and
    ``getUAsitem``.
    """

    def __init__(self, *args, **kwargs):
        """Create a spider; any arguments are accepted and ignored."""
        self.datas = {}

    async def getTypesL1(self):
        """Fetch the first-level categories.

        Downloads the explore page, creates an empty ``POOLS`` entry for each
        category link found there and starts one ``getTypesL2`` task per
        category. Returns once all of those tasks have finished.
        """
        url = "https://developers.whatismybrowser.com/useragents/explore/"
        resp = await spiderSession.get(url)
        async with trio.open_nursery() as nursery:
            for item in jq(resp.text)("#listing-by-field-name > li > h2 > a").items():
                types = item.text().strip().replace(' ', '_').lower()
                POOLS[types] = {}
                # Resolved through ``self``: names bound in a class body are
                # not reachable as bare names from inside its methods.
                nursery.start_soon(
                    self.getTypesL2,
                    POOLS[types],
                    types,
                    urljoin(url, item.attr('href')),
                )

    @staticmethod
    async def getTypesL2(target, types, href):
        """Fetch the second-level categories of one first-level category.

        Args:
            target: Mapping that receives one entry per table row, keyed by
                the normalised row name. Each entry holds ``url``, ``nums``
                and an empty ``UA_list``.
            types: Normalised name of the first-level category; used as the
                first component of every task key.
            href: Absolute URL of the first-level category page.

        Raises:
            ValueError: If the second table column of a row is not an integer.

        One task key ``"<types>__<name>__<url><page>"`` is added to ``TASKS``
        for every result page of every row.
        """
        loger.info(colored(f'fetching {href}', 'yellow'))
        resp = await spiderSession.get(href)
        rows = jq(resp.text)(
            "body > div.content-base > section > div > table > tbody > tr")
        for item in rows.items():
            link = item('td:nth-child(1)>a')
            name = link.text().strip().replace(' ', '_').lower()
            # Register the entry before parsing the count, so a row is
            # recorded even if the count turns out to be malformed.
            entry = target[name] = {}
            url = urljoin(href, link.attr('href'))
            nums = int(item('td:nth-child(2)').text().strip())
            entry['url'] = url
            entry['nums'] = nums
            entry['UA_list'] = []
            # Pages are numbered from 1; the last one may be partially filled.
            for page in range(1, math.ceil(nums / PERPAGE) + 1):
                TASKS.add(f"{types}__{name}__{url}{page}")

    @staticmethod
    async def getUAs():
        """Schedule the crawl tasks.

        While ``TASKS`` is non-empty: record its size in the module-level
        ``MAXNUMS``, log it, sleep one second, then run ``getUAsitem`` for
        every pending task under a shared ``trio.CapacityLimiter(LIMIT)``.
        """
        global MAXNUMS
        limit = trio.CapacityLimiter(LIMIT)
        while TASKS:
            MAXNUMS = len(TASKS)
            loger.info(colored(f'当前任务量:{MAXNUMS}', 'red'))
            await trio.sleep(1)
            async with trio.open_nursery() as nursery:
                # Iterate over a snapshot; presumably getUAsitem removes
                # finished entries from TASKS while running -- TODO confirm.
                for item in list(TASKS):
                    nursery.start_soon(getUAsitem, item, limit)

    def run(self):
        """Placeholder entry point; currently does nothing."""
        pass
if __name__ == "__main__":
    # Script entry point: run the crawl and always persist what was collected.
    try:
        try:
            main()
        except KeyboardInterrupt:
            # Ctrl-C is a normal way to stop the crawl; exit quietly.
            pass
        except Exception as exc:
            # Any other failure is logged, not propagated.
            loger.error(colored(exc, 'red'))
    finally:
        # Runs on success, interruption and failure alike.
        SaveJson(POOLS, 'POOLS.json')