-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbulkImport.py
More file actions
executable file
·40 lines (34 loc) · 939 Bytes
/
bulkImport.py
File metadata and controls
executable file
·40 lines (34 loc) · 939 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/usr/bin/env python
import tarfile
from oil import oil
from weaver import Web, Encoding, WebSource
# Bulk-import pages from an xz-compressed tar dump into the database.
#
# Each archive member is expected to be UTF-8 text whose first line is a
# tab-separated header: field 1 is an integer (presumably the capture
# timestamp, since it is stored as ``created_``) and field 2 is the URL.
# Everything after the first newline is stored as the response body.
#
# NOTE(review): this file was recovered from a whitespace-stripped copy; the
# nesting below (build and save a Web row only when the URL is not already
# in the cache) is reconstructed -- confirm against the original.
total = 0  # number of archive members visited so far
xzfname = 'min_bulk_dump.tar.xz'

with oil.open() as db:
    # Looked up once: every imported row shares the same source and encoding.
    source = WebSource.lookup(db, 'iris-bulk', 'iris-bulk')
    encoding = Encoding.lookup(db, 'utf8')

    with tarfile.open(xzfname, 'r:xz') as xzf:
        for member in xzf:
            total += 1
            if total % 100 == 0:
                # Progress indicator every 100 members.
                print(total)

            fobj = xzf.extractfile(member)
            if fobj is None:
                # extractfile() yields None for members that carry no data
                # (directories, devices, ...). Raise explicitly rather than
                # assert, so the check survives `python -O`.
                raise ValueError(
                    f'{xzfname}: member {member.name!r} is not a regular file'
                )
            with fobj:
                text = fobj.read().decode('utf-8')

            # Split the header line from the body at the first newline only.
            header, _, html = text.partition('\n')
            fields = header.split('\t')
            ts = int(fields[1])
            url = fields[2]

            # Skip URLs for which Web.wcache already returns at least one
            # entry (presumably: already present in the cache -- verify).
            if len(Web.wcache(db, [url])) >= 1:
                continue

            print(f' {url}: {ts}')
            w = Web(
                created_=ts,
                url_=url,
                status_=200,
                sourceId_=source.id,
                encoding_=encoding.id,
                response_=html.encode('utf-8'),
                requestHeaders_=None,
                responseHeaders_=None,
                wbaseId_=None,
            )
            w.save(db)