-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathwordclusters.jl
82 lines (67 loc) · 1.96 KB
/
wordclusters.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""
    WordClusters(vocab, clusters)

Table assigning each word of a vocabulary to an integer cluster number.

# Fields
- `vocab::Vector{String}`: the words.
- `clusters::Vector{Int}`: cluster numbers, parallel to `vocab`
  (`clusters[i]` belongs to `vocab[i]`).
- `vocab_hash::Dict{String, Int}`: maps each word to its position in `vocab`.

# Throws
- `DimensionMismatch` if `vocab` and `clusters` do not have the same length.
"""
mutable struct WordClusters
    vocab::Vector{String}
    clusters::Vector{Int}
    vocab_hash::Dict{String, Int}
    function WordClusters(vocab, clusters)
        # The two vectors are indexed in parallel; reject mismatched input up
        # front instead of failing later with an unrelated BoundsError.
        length(vocab) == length(clusters) || throw(DimensionMismatch(
            "vocab has $(length(vocab)) entries but clusters has $(length(clusters))"))
        vocab_hash = Dict{String, Int}()
        sizehint!(vocab_hash, length(vocab))
        # If a word occurs more than once, the last occurrence wins.
        for (i, word) in enumerate(vocab)
            vocab_hash[word] = i
        end
        return new(vocab, clusters, vocab_hash)
    end
end
# One-line summary: vocabulary size and number of distinct cluster numbers.
# Qualified as `Base.show` so the method extends Base's generic function
# whether or not the enclosing module has imported `show`.
function Base.show(io::IO, wc::WordClusters)
    print(io, "WordClusters $(length(wc.vocab)) words, $(length(unique(wc.clusters))) clusters")
end
"""
    vocabulary(wc)

Return the vocabulary (vector of words) of the WordClusters `wc`.

The stored vector itself is returned, not a copy.
"""
function vocabulary(wc::WordClusters)
    return wc.vocab
end
"""
    in_vocabulary(wc, word)

For the WordClusters `wc`, return `true` if `word` is part of the
vocabulary of `wc` and `false` otherwise.
"""
function in_vocabulary(wc::WordClusters, word::AbstractString)
    # Constant-time lookup in the word => position table built by the
    # constructor, rather than a linear scan over `wc.vocab`.
    return haskey(wc.vocab_hash, word)
end
"""
    index(wc, word)

Return the position of `word` in the vocabulary of the WordClusters `wc`.

The lookup goes through `wc.vocab_hash`; a `word` that is not a key of that
dictionary raises a `KeyError`.
"""
function index(wc::WordClusters, word)
    return wc.vocab_hash[word]
end
"""
    get_cluster(wc, word)

Return the cluster number assigned to `word` in the WordClusters `wc`.

The word is first resolved to its vocabulary position with `index`, and that
position is then used to read `wc.clusters`.
"""
function get_cluster(wc::WordClusters, word)
    position = index(wc, word)
    return wc.clusters[position]
end
"""
    clusters(wc)

Return the distinct cluster numbers of the WordClusters `wc`, in ascending
order.
"""
function clusters(wc::WordClusters)
    # `unique` allocates a fresh vector, so sorting it in place leaves
    # `wc.clusters` untouched.
    distinct = unique(wc.clusters)
    return sort!(distinct)
end
"""
    get_words(wc, cluster)

For the WordClusters `wc`, return every word whose cluster number equals
`cluster`, in vocabulary order.
"""
function get_words(wc::WordClusters, cluster::Int)
    # Positions in `wc.clusters` holding the requested cluster number.
    members = findall(==(cluster), wc.clusters)
    return wc.vocab[members]
end
"""
    wordclusters(fname)

Generate a WordClusters object from the text file `fname`.

Each non-blank line is expected to hold two whitespace-separated fields: a
word followed by its integer cluster number. Blank lines are skipped. A line
with any other number of fields is skipped with a warning.

# Throws
- An `ArgumentError` (from `parse`) if the second field of a line is not a
  valid integer.
"""
function wordclusters(fname::AbstractString)
    vocab = String[]
    clusters = Int[]
    open(fname) do f
        for (lineno, line) in enumerate(eachline(f))
            # Split on any run of whitespace so tabs, repeated spaces and a
            # trailing carriage return are all tolerated.
            entries = split(line)
            isempty(entries) && continue
            if length(entries) != 2
                # Report the bad line instead of silently dropping the rest
                # of the file.
                @warn "Skipping malformed line in cluster file" file = fname line = lineno
                continue
            end
            push!(vocab, entries[1])
            push!(clusters, parse(Int, entries[2]))
        end
    end
    return WordClusters(vocab, clusters)
end