This repository was archived by the owner on Nov 15, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfunctions.py
More file actions
103 lines (81 loc) · 2.16 KB
/
functions.py
File metadata and controls
103 lines (81 loc) · 2.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from nltk.tokenize import RegexpTokenizer
import nltk, re
import nltk, re
def build_params(filename):
    """Load a whitespace-separated integer dataset from *filename*.

    Every non-blank line is parsed as a sequence of integers: the last
    value on the line is the label, everything before it is the feature
    row. Blank lines are skipped.

    Prints ``build_params`` on entry and the number of rows read once the
    whole file has been consumed.

    Args:
        filename: Path of the text file to read.

    Returns:
        A dict ``{'x': rows, 'y': labels}`` where ``rows`` is a list of
        integer lists and ``labels`` is the parallel list of integers.

    Raises:
        ValueError: If a field cannot be parsed as an integer.
    """
    print('build_params')
    rows = []
    labels = []
    # The context manager closes the file even when a line fails to parse.
    with open(filename, 'r') as fp:
        # Iterate the file lazily instead of loading every line up front.
        for line in fp:
            fields = line.split()
            if not fields:
                # A blank line (e.g. a trailing newline) carries no sample.
                continue
            values = [int(field) for field in fields]
            rows.append(values[:-1])
            labels.append(values[-1])
    print(len(rows))
    return {'x': rows, 'y': labels}
# Tokenizer shared by every stemm() call; it keeps runs of word characters.
_stemm_tokenizer = RegexpTokenizer(r'\w+')


def stemm(text):
    """Normalise *text* and return the list of its stemmed tokens.

    Steps, in order: lower-case the text, let ``_replace_important`` swap
    emoticons and markers for placeholder words, turn every run of digits
    into the literal word ``number``, tokenize, and pass each token through
    NLTK's Snowball ``RussianStemmer``.
    """
    marked = _replace_important(text.lower())
    # Digit runs are not dropped; each one becomes the word 'number'.
    without_digits = re.sub(r'\d+', 'number', marked)
    words = _stemm_tokenizer.tokenize(without_digits)
    stemmer = nltk.stem.snowball.RussianStemmer()
    # TODO: should we remove 2 letter words?
    stems = []
    for word in words:
        stems.append(stemmer.stem(word))
    return stems
_important_words = [
(':*', ' KISS '),
(';*', ' KISS '),
('=*', ' KISS '),
('!?', ' EXCLRIDDL '),
('?!', ' EXCLRIDDL '),
('!', ' EXCLAMATION '),
('?', ' QUESTION '),
(':-)', ' SMILE '),
('=)', ' SMILE '),
('))', ' SMILE '),
('((', ' ANTISMILE '),
(':-(', ' ANTISMILE '),
(';-(', ' ANTISMILE '),
(';-)', ' WINK '),
(':)', ' SMILE '),
(';)', ' WINK '),
(':3', ' CATFACE '),
('^_^', ' CATFACE '),
(':d', ' BIGSMILE '),
(';d', ' BIGSMILE '),
(':-d', ' BIGSMILE '),
('..', ' ETCR '),
('яя', ' GREATI '),
('ла-ла', ' SINGSONG '),
('+100500', ' PLUSBIGNUMBER '),
('100500', ' BIGNUMBER '),
('$', ' DOLLAR '),
('ну-ну', ' SARCASM '),
('ммм', ' DREAMING '),
('ааа', ' SCARY '),
('(c)', ' COPYRIGHT '),
('%', ' PERCENT ')
]
def _replace_important(text):
for (word, replacement) in _important_words:
text = text.replace(word, replacement)
return text
def load_dict(filename):
    """Read the second tab-separated column of every line in *filename*.

    Each line is split on tab characters and its second field, with any
    newline characters stripped from its ends, is collected in file order.
    Lines consisting solely of a newline are skipped.

    Args:
        filename: Path of the tab-separated text file to read.

    Returns:
        A list of strings, one per non-blank line.

    Raises:
        IndexError: If a non-blank line contains no tab character.
    """
    words = []
    # The context manager closes the file even when a line is malformed.
    with open(filename, 'r') as dict_file:
        # Iterate the file lazily instead of loading every line up front.
        for line in dict_file:
            if line == '\n':
                # A bare blank line (e.g. at the end of the file) has no entry.
                continue
            words.append(line.split('\t')[1].strip('\n'))
    return words