# This is an R script to estimate the k (size) and p (probability) parameters of a
# Negative Binomial Distribution (NBD), assuming that word frequency distributions follow an NBD.
# Usage: Rscript fit2.R /path/to/inputfile.txt /path/to/outputfile.txt
# WARNING! Running this script can be a compute-intensive operation; use at your own risk.
# No liability of any kind.
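# A sketch of one input layout consistent with how the file is read below (an assumption;
# the real corpus files may differ): a space-separated table whose header row lists the
# words and whose remaining rows hold per-document counts, e.g.
#
#   cat dog fish
#   d1 3 0 1
#   d2 0 2 5
#
# With a header that has one fewer field than the data rows, read.table() uses the first
# column (the document IDs) as row names; note that sep = " " expects single spaces.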
# Load the bbmle library
library("bbmle")
# Check that both input and output file names are provided
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
  stop("Please provide both input and output file names as command line arguments.")
}
# Get the input and output file names from the command line
input_file <- args[1]
output_file <- args[2]
# Read the word frequency table from a flat, space-separated text file
file1 <- read.table(file = input_file, header = TRUE, sep = " ", encoding = "UTF-8", fileEncoding = "UTF-8")
# Transpose the data so that each row corresponds to one word
filet <- t(file1)
# Define the negative log-likelihood function for the Negative Binomial Distribution
minuslogl <- function(size, prob, x) {
  -sum(dnbinom(x = x, size = size, prob = prob, log = TRUE))
}
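# Under R's (size, prob) parameterization, dnbinom() gives
#   P(X = x) = Gamma(x + size) / (Gamma(size) * x!) * prob^size * (1 - prob)^x
# with mean size * (1 - prob) / prob and variance size * (1 - prob) / prob^2, so for a
# given mean a smaller size (k) means a heavier-tailed, more overdispersed count distribution.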
# Initialize vectors for size (k), probability (p), sqrt_kp, p/q, per-word total counts, and document frequency (DF)
vsize <- numeric(0)
vprob <- numeric(0)
sqrt_kp <- numeric(0)
pq_value <- numeric(0)
sum_vector <- numeric(0)
doc_freq <- numeric(0)
# Iterate over the rows of the transposed data (one row per word)
for (i in seq_len(nrow(filet))) {
  # Extract the numeric counts for the current word
  file <- as.numeric(filet[i, ])
  # Total number of occurrences of the current word (word frequency, WF)
  sum_vector[i] <- sum(file)
  # Document frequency (DF): in how many documents the word is found
  doc_freq[i] <- sum(file != 0)
  # Check whether there is variability in the data; a constant count vector cannot be fitted
  if (var(file) == 0) {
    vsize[i] <- NaN
    vprob[i] <- NaN
    sqrt_kp[i] <- NaN
    pq_value[i] <- NaN
  } else {
    # Fit the Negative Binomial Distribution by maximum likelihood estimation;
    # the box constraints keep size and prob strictly inside the valid parameter range
    m <- mle2(
      minuslogl = minuslogl,
      start = list(size = 0.01, prob = 0.01),
      data = list(x = file),
      method = "L-BFGS-B",
      lower = list(size = 1e-4, prob = 1e-4),
      upper = list(size = 1e+5, prob = 0.9999)
    )
    # Store the estimated parameters and compute sqrt_kp, here the Euclidean norm sqrt(k^2 + p^2)
    vsize[i] <- coef(m)[1]
    vprob[i] <- coef(m)[2]
    sqrt_kp[i] <- sqrt(vsize[i] * vsize[i] + vprob[i] * vprob[i])
    # Odds-style ratio p / (1 - p), i.e. p/q
    pq_value[i] <- vprob[i] / (1 - vprob[i])
  }
}
# Document frequency to word frequency ratio (DF/WF) for each word;
# avoid division by zero by setting the ratio to zero when the total count is zero
df_wf_ratio <- ifelse(sum_vector == 0, 0, doc_freq / sum_vector)
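# A DF/WF ratio close to 1 means the word rarely repeats within a document, while values
# near zero indicate that its occurrences are concentrated in relatively few documents.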
# Calculate Composite Relevance Index for each word
# Avoid division by zero by setting ratio to zero when sum_vector is zero
# cpi_calculate <- ifelse(sum_vector == 0, 0, sqrt_kp * df_wf_ratio)
# Assuming 'cpi_calculate' is a vector of CPI values
# Determine quartiles dynamically
# quartiles <- quantile(cpi_calculate, probs = c(0, 0.25, 0.5, 0.75, 1))
# Categorize CPI values
# cpi_categories <- cut(cpi_calculate,
# breaks = quartiles,
# labels = c("VLowCPI", "LowCPI", "MediumCPI", "HighCPI"),
# include.lowest = TRUE)
# Specify the number of clusters (you can adjust this based on your requirements)
num_clusters <- 12
# Create a data frame with word, size, probability, and sqrt_kp
# To print the actual word frequencies (e.g. for diagnostics), change "filet[, 0]" to "filet[, ]"
# d <- data.frame(word = filet[, 0], k = vsize, p = vprob, sqrt_kp = sqrt_kp, sum = sum_vector, df = doc_freq, df_wf = df_wf_ratio, cpi = cpi_calculate, cpi_cat = cpi_categories)
# d <- data.frame(word = filet[, 0], k = vsize, p = vprob, sqrt_kp = sqrt_kp, sum = sum_vector, df = doc_freq, df_wf = df_wf_ratio, cluster = cluster)
### Now we will cluster words
# Alternative: cluster on the raw k and p estimates
# cluster_data <- data.frame(vsize, vprob)
# Use scaled sqrt_kp together with the DF/WF ratio as the clustering features
scaled_kp <- scale(sqrt_kp)
cluster_data <- data.frame(scaled_kp, df_wf_ratio)
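# Note: any word flagged NaN above (zero-variance counts) would poison scale() and make
# kmeans() fail; such rows are assumed absent here, or should be filtered out beforehand.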
# Perform k-means clustering on the scaled sqrt_kp values and the DF/WF ratio
kmeans_result <- kmeans(cluster_data, centers = num_clusters)
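# kmeans() starts from random centers, so cluster labels can differ between runs; call
# set.seed() before kmeans() if reproducible assignments are needed.
# Optional diagnostic (commented out so the script's behaviour is unchanged): compare the
# total within-cluster sum of squares over a range of k values to help pick num_clusters.
# for (k in 2:20) {
#   cat(k, kmeans(cluster_data, centers = k)$tot.withinss, "\n")
# }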
# Get the within-cluster sum of squared distances for each cluster
ssd_per_cluster <- kmeans_result$withinss
# Order clusters based on sum of squared distances
ordered_clusters <- order(ssd_per_cluster)
# Map original cluster numbers to ordered cluster numbers
cluster_order_mapping <- match(kmeans_result$cluster, ordered_clusters)
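# For example, if ordered_clusters is c(3, 1, 2), a word whose original kmeans label is 3
# is relabeled 1, since match(3, c(3, 1, 2)) == 1.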
# Add the ordered cluster assignment to the original data
cluster_data$ordered_cluster <- as.factor(cluster_order_mapping)
# Alternative: keep the original (unordered) cluster assignment instead
# cluster_data$cluster <- as.factor(kmeans_result$cluster)
# Write the header to the output file
header <- "word\tk\tp\tsqrt_kp\tsum\tDF\tDFtoWF\tcluster\tpq"
write(header, file = output_file)
#scaled_pq_data <- scale(pq_value)
#scaled_k <- scale(vsize)
# Create a data frame with word, size, probability, sqrt_kp and the other per-word statistics
d <- data.frame(
  word = filet[, 0], # Zero-column selection carries the word names as row names (written out via row.names = TRUE below); to print the actual word frequencies (e.g. for diagnostics), change this to "filet[, ]"
  k = vsize,
  p = vprob,
  sqrt_kp = sqrt_kp,
  sum = sum_vector,
  df = doc_freq,
  df_wf = df_wf_ratio,
  ordered_cluster = cluster_data$ordered_cluster,
  pq = pq_value
  # scaled_pq_data = scaled_pq_data,
  # scaled_k = scaled_k
)
# Append the results to the tab-separated output file; the header was written above and the
# word names are supplied by the row names
write.table(d, file = output_file, sep = "\t", quote = FALSE, row.names = TRUE, col.names = FALSE, append = TRUE)
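# Example invocation (hypothetical file names):
#   Rscript fit2.R word_counts.txt nbd_fit.tsv
# The output is a tab-separated table with one row per word and the columns named in the
# header above: k, p, sqrt_kp, total count, DF, the DF/WF ratio, the cluster label, and p/q.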