-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprescrape.py
More file actions
executable file
·53 lines (45 loc) · 1.25 KB
/
prescrape.py
File metadata and controls
executable file
·53 lines (45 loc) · 1.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python
import sys
import psycopg2
from bs4 import BeautifulSoup # type: ignore
from oil import oil
from weaver import WebScraper
import weaver.enc as enc
def plog(msg: str, fname: str = "./pffn.log") -> None:
    """Append ``msg`` as one line to the log file and echo it to stdout.

    Args:
        msg: Text to log; a trailing newline is added in the file.
        fname: Path of the log file. It is opened in append mode, so it is
            created if missing and never truncated.
    """
    # Pin the encoding: without it the log is written in the locale's default
    # encoding, which varies between machines and can fail on non-ASCII text.
    with open(fname, 'a', encoding='utf-8') as f:
        f.write(msg + '\n')
    print(msg)
def prescrape(scraper: WebScraper, url: str) -> None:
    """Run ``scraper.softScrape`` on ``url`` and print a summary of the result.

    The summary lists the raw response size and the request/response headers.
    The response is then handed to ``enc.decode``; when that returns ``None``
    the function reports an unknown encoding and stops. Otherwise it prints
    the detected encoding, parses the decoded text with html5lib, and prints
    the decoded length.

    Args:
        scraper: Scraper used to fetch ``url``.
        url: Address to fetch.
    """
    print(f"url: {url}")
    page = scraper.softScrape(url)

    raw = page.response
    size = 0 if raw is None else len(raw)
    print(f"\tresponse size: {size}B")
    print(f"\trequest headers: {page.requestHeaders!r}")
    print(f"\tresponse headers: {page.responseHeaders!r}")

    decoded = enc.decode(raw, url)
    if decoded is None:
        print("\tunknown encoding")
        return

    # decoded[0] is printed as the encoding; decoded[1] is the decoded text.
    print(f"\tencoding: {decoded[0]}")
    text = decoded[1]
    # The parse tree is discarded; the call is kept so that a parse failure
    # still surfaces before the decoded size is printed.
    BeautifulSoup(text, 'html5lib')
    print(f"\tdecoded size: {len(text)}B")
def main(db: 'psycopg2.connection') -> int:
    """Prescrape every URL read from stdin, one URL per line.

    If exactly one command-line argument is given, it is parsed as an integer
    and assigned to ``scraper.baseDelay`` before any URL is processed.

    Args:
        db: Database connection handed to ``WebScraper``.

    Returns:
        Always 0. A failure on a single URL is reported on stderr and does
        not stop the run.

    Raises:
        ValueError: If the command-line argument is not an integer.
    """
    scraper = WebScraper(db)
    plog('==========')
    plog(f"source: {scraper.source.__dict__}")

    if len(sys.argv) == 2:
        scraper.baseDelay = int(sys.argv[1])
    print(f"baseDelay: {scraper.baseDelay}")

    for line in sys.stdin:
        url = line.strip()
        if not url:
            # Blank line: there is no URL to fetch.
            continue
        try:
            prescrape(scraper, url)
        except Exception as e:
            # Best effort: one bad URL must not abort the batch, but the
            # failure is reported instead of being silently dropped.
            # KeyboardInterrupt and SystemExit are not Exception subclasses,
            # so Ctrl-C and explicit exits still stop the run.
            print(f"\tprescrape failed: {e!r}", file=sys.stderr)
    return 0
if __name__ == '__main__':
    # Script entry point: run main() with the object yielded by oil.open()
    # and use main's return value as the process exit status.
    with oil.open() as conn:
        exit_code = main(conn)
    sys.exit(exit_code)