-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcms.py
More file actions
executable file
·122 lines (95 loc) · 2.79 KB
/
cms.py
File metadata and controls
executable file
·122 lines (95 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import hashlib
import numpy as np
import matplotlib.pyplot as plt
import random
import sys
from tqdm import tqdm
def _run_trials(data: list[str], tags: set[str], trials: int = 20) -> None:
    """Feed `data` through fresh sketches and print one result line per trial.

    Each trial builds a new default-sized `cms`, salts its hash with the
    trial index (as a string), inserts every item of `data` in order, and
    prints "<estimated count of 'workout'>, <number of heavy hitters>".
    """
    for trial in range(trials):
        s = str(trial)
        count = cms()
        for d in data:
            count.Inc(d, s)
        print("%d, %d" % (count.Count("workout", s), countHH(count, tags, s)))


def main():
    """Compare sketch estimates when the same stream arrives in three orders.

    Reads one tag per line from "stream.txt", then replays the stream
    sorted by ascending true frequency, by descending true frequency, and
    in random order, running 20 independently salted trials for each.
    """
    # NOTE(review): this value is only printed; the trials below salt their
    # hashes with the trial index, and `random` is never seeded from it.
    s = str(123)
    print("Using Seed: %s" % s)

    filename = "stream.txt"
    print("Opening File: " + filename)
    with open(filename, "r", encoding="utf-8") as f:
        data = [line.rstrip() for line in f]

    realCount, tags = getRealCount(data)

    print("\n\nLEAST POPULAR:")
    data.sort(key=lambda tag: realCount[tag])
    _run_trials(data, tags)

    print("\n\nMOST POPULAR:")
    # reverse=True keeps the sort stable, so ties stay in their current
    # order exactly as they would with a negated key.
    data.sort(key=lambda tag: realCount[tag], reverse=True)
    _run_trials(data, tags)

    print("\n\nRANDOM:")
    random.shuffle(data)
    _run_trials(data, tags)
"""
count.Inc(line.rstrip(), s)
tags.add(line.rstrip())
heavyHitterThresh = int(count.Total() / 100)
print("Heavy Hitter Threshold: %d" % heavyHitterThresh)
for tag in tags:
if count.Count(tag, s) > heavyHitterThresh:
hh.add(tag)
print(tag)
print(len(hh))
"""
class cms:
    """Count-min sketch for string items, hashed with a salted MD5 digest.

    The sketch is an `l` x `b` table of integer counters. Row `i` uses
    byte `i` of `md5(x + s)` (modulo `b`) as its column, so:

    * `l` must not exceed 16, the length of an MD5 digest; a larger value
      raises `IndexError` on the first `Inc`/`Count` call.
    * a digest byte is at most 255, so columns beyond index 255 are never
      touched when `b` is larger than 256.

    With `conUpdate=True` (the default) `Inc` applies a conservative
    update: only the counters currently holding the minimum value for the
    item are incremented. With `conUpdate=False` every row's counter is
    incremented.
    """

    def __init__(self, l: int = 5, b: int = 256, conUpdate: bool = True):
        self.CMS = np.zeros((l, b), dtype=int)  # counter table, rows x buckets
        self.l = l
        self.b = b
        self.total = 0  # number of Inc calls so far
        self.conUpdate = conUpdate

    def _buckets(self, x: str, s: str) -> list[tuple[int, int]]:
        """Return the (row, column) cell for `x` in every row, salted by `s`."""
        digest = hashlib.md5((x + s).encode("utf-8")).digest()
        return [(row, digest[row] % self.b) for row in range(self.l)]

    def Inc(self, x: str, s: str) -> None:
        """Record one occurrence of `x`, using `s` as the hash salt."""
        self.total += 1
        cells = self._buckets(x, s)
        if self.conUpdate:
            # Conservative update: pick the cells sitting at the current
            # minimum *before* touching any of them.
            floor = min(self.CMS[cell] for cell in cells)
            cells = [cell for cell in cells if self.CMS[cell] == floor]
        for cell in cells:
            self.CMS[cell] += 1

    def Count(self, x: str, s: str) -> int:
        """Return the estimated count of `x` (minimum over its cells).

        `s` must be the same salt string that was passed to `Inc`.
        The result is converted to a built-in `int`.
        """
        return int(min(self.CMS[cell] for cell in self._buckets(x, s)))

    def Total(self) -> int:
        """Return how many times `Inc` has been called."""
        return self.total
def getRealCount(data: list[str]) -> tuple[dict[str, int], set[str]]:
    """Count exact occurrences of each item in `data`.

    Returns a pair `(counts, tags)`: `counts` maps every distinct item to
    the number of times it appears, and `tags` is the set of distinct
    items. Both are empty for empty input.
    """
    realCount: dict[str, int] = {}
    for d in data:
        realCount[d] = realCount.get(d, 0) + 1
    # The distinct items are exactly the keys that were counted.
    return realCount, set(realCount)
def countHH(count: "cms", tags: set[str], s: str) -> int:
    """Return how many of `tags` the sketch reports as heavy hitters.

    A tag is a heavy hitter when its estimate `count.Count(tag, s)` is
    strictly greater than one hundredth of `count.Total()`, rounded down.
    `s` is the salt string the sketch was filled with.
    """
    # The threshold does not depend on the tag, so compute it once.
    threshold = count.Total() // 100
    return sum(1 for tag in tags if count.Count(tag, s) > threshold)
# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()