-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathimport_export_worldbank.py
More file actions
73 lines (57 loc) · 1.85 KB
/
import_export_worldbank.py
File metadata and controls
73 lines (57 loc) · 1.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from multiprocessing.dummy import Pool as ThreadPool
#import urlparse2
#import html2text
from bs4 import BeautifulSoup
import urllib
import urllib.request
import re
import os
import csv
import re
import requests
import time
from random import choice
import wikipedia
#import cv2
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing.dummy import Pool as ThreadPool
# Collection of browser User-Agent strings.  Nothing in the code visible in
# this file reads the list; make_soup() sends its own fixed User-Agent.
user_agents = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
]
def make_soup(url):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    The request is sent with a fixed desktop-Chrome ``User-Agent`` header
    rather than urllib's default one.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        urllib.error.URLError: Propagated from ``urllib.request.urlopen``
            when the request fails.
    """
    req = urllib.request.Request(
        url,
        data=None,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        }
    )
    # Parse inside the ``with`` block: BeautifulSoup reads the whole response
    # while constructing the tree, after which the HTTP response is closed
    # deterministically instead of lingering until garbage collection.
    with urllib.request.urlopen(req) as response:
        return BeautifulSoup(response, 'html.parser')
# Page whose <h3 class="countryHeading"> anchors are scraped below.
url = "http://wits.worldbank.org/countrystats.aspx?lang=en"
soup = make_soup(url)

# One [country name, country URL] entry per anchor found inside the country
# headings; fun() later appends two more values to each entry.
l = []
for heading in soup.find_all('h3', {"class": "countryHeading"}):
    l.extend([anchor.text, anchor['href']] for anchor in heading.find_all('a'))

# Positions in ``l``; these are the work items handed to the thread pool.
ind = list(range(len(l)))
def fun(i):
    """Scrape the indicator cells for entry ``i`` and append them to ``l[i]``.

    Reads the page URL from ``l[i][1]``, downloads it with ``make_soup`` and
    appends the text of the first two ``td`` cells whose class is
    ``data importIndicator alignRight`` to ``l[i]`` in place.  When the page
    has fewer than two such cells, ``None`` is appended for each missing one,
    so every entry always gains exactly two values and one incomplete page
    no longer raises ``IndexError`` and aborts the whole ``pool.map`` run.

    Args:
        i: Index into the module-level list ``l``.

    Returns:
        None.  The result is stored by mutating ``l[i]``.
    """
    country_soup = make_soup(l[i][1])
    cells = country_soup.find_all('td', {"class": "data importIndicator alignRight"})
    values = [cell.text for cell in cells[:2]]
    # Pad to exactly two values so rows keep a uniform shape even when the
    # page is missing one or both indicator cells.
    values.extend([None] * (2 - len(values)))
    l[i].extend(values)
# Process every index in ``ind`` on a pool of 20 worker threads.  fun()
# stores its findings in ``l`` and returns None, so ``results`` only holds
# one None per processed entry.
with ThreadPool(processes=20) as pool:
    results = pool.map(fun, ind)

# Print each entry of ``l`` on its own line.
for row in l:
    print(row)