-
Notifications
You must be signed in to change notification settings - Fork 4
/
file_process.py
89 lines (68 loc) · 2.94 KB
/
file_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import csv, re
# Build a per-emoji sample of tweets that contain no URL and are at least
# MIN_LENGTH characters long.
#
# Inputs (all read from the current working directory):
#   emoji.txt                      one label per line
#   <label>.csv, <label>2.csv,     tweet dumps for that label; the first row is
#   <label>3.csv                   a header and the tweet text is column index 2
# Outputs:
#   output_len80_total1000.csv       rows of [tweet, len(tweet), label]
#   emojicount_len80_total1000.csv   rows of [label, number of tweets kept]

# URL matcher; a tweet with any match is rejected. Compiled once here because
# it is applied to every row of every input file.
URL_RE = re.compile(
    r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
)

MIN_LENGTH = 80        # shortest tweet (in characters) that is kept
MAX_PER_EMOJI = 1000   # most tweets kept for a single label

# File-name suffixes tried, in this order, for every label.
SOURCE_SUFFIXES = ('.csv', '2.csv', '3.csv')


def _copy_tweets(path, label, writer, count):
    """Append qualifying tweets from one input CSV to the combined output.

    A tweet qualifies when it is at least MIN_LENGTH characters long and
    URL_RE finds no match in it.

    Args:
        path: input CSV file; its first row is skipped as a header.
        label: value written in the third output column.
        writer: csv writer for the combined output file.
        count: number of tweets already kept for this label.

    Returns:
        The updated count, never larger than MAX_PER_EMOJI (assuming the
        incoming count was not).

    Raises:
        FileNotFoundError: if the file has to be read and does not exist.
    """
    # Cap already reached: do not even open the file. The earlier version
    # read the first file without any cap and always opened the second one,
    # so a label could end up with more than MAX_PER_EMOJI rows.
    if count >= MAX_PER_EMOJI:
        return count

    # newline='' lets the csv module handle line endings itself, as the csv
    # documentation requires (quoted fields may contain newlines).
    with open(path, 'r', encoding='utf-8', newline='') as f:
        rows = csv.reader(f)
        next(rows, None)  # skip the header; tolerate a completely empty file
        for tokens in rows:
            # Blank lines come back as [] and truncated rows have no text
            # column; neither can hold a tweet.
            if len(tokens) < 3:
                continue
            tweet = tokens[2]
            if len(tweet) < MIN_LENGTH or URL_RE.search(tweet):
                continue
            writer.writerow([tweet, len(tweet), label])
            count += 1
            if count >= MAX_PER_EMOJI:
                break
    return count


def main():
    """Read emoji.txt and write the combined tweet file and the count file."""
    # The other files in this script are UTF-8, so read the label list the
    # same way instead of relying on the platform default encoding.
    with open('emoji.txt', 'r', encoding='utf-8') as f:
        emoji_list = f.read().splitlines()
    print(emoji_list)

    # Both outputs are context-managed so they are flushed and closed even
    # when an input file is missing or unreadable.
    with open('output_len80_total1000.csv', 'w', encoding='utf-8', newline='') as data_all:
        with open('emojicount_len80_total1000.csv', 'w', encoding='utf-8', newline='') as emoji_count:
            output = csv.writer(data_all)
            emoji_count_f = csv.writer(emoji_count)
            for emoji in emoji_list:
                count = 0
                for suffix in SOURCE_SUFFIXES:
                    count = _copy_tweets(emoji + suffix, emoji, output, count)
                emoji_count_f.writerow([emoji, count])


if __name__ == "__main__":
    main()