Skip to content

Commit d410cde

Browse files
committed
Added w computation based on the average distance of the dataset vectors to all methods | Testing for different values of W for all methods
1 parent 0ae0036 commit d410cde

39 files changed

+35358
-71
lines changed

Clustering/clustering.c

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,43 @@
1818
#define TRUE 1
1919
#define FALSE 0
2020
#define MAX_RECENTER_ITERATIONS 15
21+
#define W_DIVIDER 80
2122

2223
extern int numOfVecs;
2324
extern int d;
2425
extern int hashTableSize;
2526
extern int complete;
27+
extern int w;
28+
29+
30+
int wValueCalculation(List list,int numberOfVectorsInFile){
31+
long double sumDist = 0.0;
32+
int count=0;
33+
double persentageToCheck;
34+
if(numberOfVectorsInFile<=1000){
35+
persentageToCheck = 0.1;
36+
}else if(numberOfVectorsInFile<=10000){
37+
persentageToCheck = 0.001;
38+
}else if (numberOfVectorsInFile<=100000){
39+
persentageToCheck = 0.0001;
40+
}else{
41+
persentageToCheck = 0.000001;
42+
}
43+
int stopBound = persentageToCheck*numberOfVectorsInFile*numberOfVectorsInFile;
44+
while(list!=NULL){
45+
List nested = list;
46+
while(nested!=NULL){
47+
if(count>stopBound){
48+
return floor(sumDist/count);
49+
}
50+
sumDist += distance_metric(getVector(list),getVector(nested),d);
51+
count++;
52+
nested = getNext(nested);
53+
}
54+
list=getNext(list);
55+
}
56+
return floor(sumDist/count);
57+
}
2658

2759

2860

@@ -174,7 +206,7 @@ void reverseAssignmentLSH(LSH lsh,Vector *vectors,Vector *clusters,Vector *oldCl
174206
}
175207
// finally delete each cluster in order to form a new one based on the new centroid
176208
htDelete(clustersHt[i],0);
177-
clustersHt[i] = htInitialize(numOfVecs/(4*numOfClusters));
209+
clustersHt[i] = htInitialize(numOfVecs/(4*numOfClusters));
178210
// save the new centroid
179211
clusters[i]=newCenter;
180212
}
@@ -254,14 +286,24 @@ void clusteringLSH(List vecList,int numOfClusters,int l,FILE* fptr){
254286
}else{
255287
hashTableSize=numOfVecs/32;
256288
}
289+
257290
clock_t begin = clock();
291+
w = wValueCalculation(vecList,numOfVecs);
292+
w /= W_DIVIDER;
293+
clock_t end = clock();
294+
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
295+
printf("Found value of w in %f seconds, w = %d\n",time_spent,w );
296+
297+
298+
299+
begin = clock();
258300
LSH lsh = initializeLSH(l);
259301
for(int i=0;i<numOfVecs;i++){
260302
initializeClusterInfo(vectors[i]);
261303
insertToLSH(lsh,vectors[i]);
262304
}
263-
clock_t end = clock();
264-
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
305+
end = clock();
306+
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
265307
printf("Created LSH in : %f seconds\n",time_spent);
266308

267309

@@ -278,8 +320,8 @@ void clusteringLSH(List vecList,int numOfClusters,int l,FILE* fptr){
278320
while((countLSH<2) || !centroidsConverge(clusters,oldClusters,numOfClusters,d)){ // check for convergence only after the second iteration
279321
if(countLSH==MAX_RECENTER_ITERATIONS)
280322
break;
281-
// while(firstIter || count<20){
282323
countLSH++;
324+
printf("LOOP %d\n",countLSH);
283325
if(!firstIterLSH){
284326
Vector *temp = oldClusters;
285327
oldClusters=clusters;
@@ -425,13 +467,24 @@ void clusteringHypercube(List vecList,int numOfClusters,int m,int probes,FILE* f
425467
}
426468
props = calloc(numOfVecs,sizeof(double));
427469

470+
clock_t begin = clock();
471+
w = wValueCalculation(vecList,numOfVecs);
472+
w /= W_DIVIDER;
473+
clock_t end = clock();
474+
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
475+
printf("Found value of w in %f seconds, w = %d\n",time_spent,w );
476+
428477
// allocate and initialize the Hypercube with the vectors that will be inserted into clusters
429478
hashTableSize=numOfVecs/16;
479+
begin = clock();
430480
HyperCube cube = initializeHyperCube();
431481
for(int i=0;i<numOfVecs;i++){
432482
initializeClusterInfo(vectors[i]);
433483
insertToHyperCube(cube,vectors[i]);
434484
}
485+
end = clock();
486+
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
487+
printf("Created Hypercube in : %f seconds\n",time_spent);
435488

436489
clock_t cluster_start = clock();
437490
// find the original centroids with the kmeans++ Algorithm

Hypercube/hypercube.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <limits.h>
66
#include "../Vector/vector.h"
77
#include "../hashTable/hashTable.h"
8+
#include "../hashTable/hashTableList/hashTableList.h"
89
#include "HashMap/hashmap.h"
910
#include "../LSH/helperFunctions.h"
1011

@@ -136,6 +137,16 @@ void insertToHyperCube(HyperCube hc,Vector v){
136137
htInsert(hc->hypercube,v,decimal_index,-1);
137138
}
138139

140+
void insertFromListToHyperCube(List list,HyperCube hc){
141+
// insert every vector of the list into the HyperCube
142+
if(list==NULL){ return;}
143+
List temp=list;
144+
while(temp!=NULL){
145+
insertToHyperCube(hc,getVector(temp));
146+
temp=getNext(temp);
147+
}
148+
}
149+
139150
void printHyperCube(HyperCube hc){
140151
printf("-------- HyperCube --------\n");
141152
htPrint(hc->hypercube);

Hypercube/hypercube.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ typedef struct hc_n *HyperCube;
66
HyperCube initializeHyperCube();
77

88
void insertToHyperCube(HyperCube ,Vector );
9+
void insertFromListToHyperCube(List ,HyperCube );
910

1011
void printHyperCube(HyperCube );
1112

Hypercube/hypercube.h.gch

28 Bytes
Binary file not shown.

cluster.conf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
number_of_clusters: 6
22
number_of_vector_hash_tables: 5
33
number_of_vector_hash_functions: 6
4-
max_number_M_hypercube: 500
5-
number_of_hypercube_dimensions: 4
4+
max_number_M_hypercube: 1000
5+
number_of_hypercube_dimensions: 5
66
number_of_probes: 16

mainCube.c

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,50 @@
33
#include <string.h>
44
#include <time.h>
55
#include <unistd.h>
6+
#include <math.h>
67
#include "Vector/vector.h"
78
#include "./hashTable/hashTable.h"
89
#include "Hypercube/hypercube.h"
910
#include "./parsing/parsingCube.h"
1011
#include "./hashTable/hashTableList/hashTableList.h"
1112

12-
#define W_VALUE 4
13+
#define W_DIVIDER 80
1314

1415
int d;
1516
int new_dimension;
1617
int m;
1718
int probes;
1819
int w;
1920

21+
int wValueCalculation(List list,int numberOfVectorsInFile){
22+
long double sumDist = 0.0;
23+
int count=0;
24+
double persentageToCheck;
25+
if(numberOfVectorsInFile<=1000){
26+
persentageToCheck = 0.1;
27+
}else if(numberOfVectorsInFile<=10000){
28+
persentageToCheck = 0.001;
29+
}else if (numberOfVectorsInFile<=100000){
30+
persentageToCheck = 0.0001;
31+
}else{
32+
persentageToCheck = 0.000001;
33+
}
34+
int stopBound = persentageToCheck*numberOfVectorsInFile*numberOfVectorsInFile;
35+
while(list!=NULL){
36+
List nested = list;
37+
while(nested!=NULL){
38+
if(count>stopBound){
39+
return floor(sumDist/count);
40+
}
41+
sumDist += distance_metric(getVector(list),getVector(nested),d);
42+
count++;
43+
nested = getNext(nested);
44+
}
45+
list=getNext(list);
46+
}
47+
return floor(sumDist/count);
48+
}
49+
2050

2151

2252
void printOptions(){
@@ -43,7 +73,6 @@ int main(int argc, char *argv[]) {
4373
int n=1;
4474
int r=10000;
4575
int probes=2;
46-
w = W_VALUE;
4776

4877
while((option = getopt(argc, argv, "i:q:k:M:p:o:N:R:")) != -1){
4978
switch(option){
@@ -161,19 +190,38 @@ int main(int argc, char *argv[]) {
161190
}
162191
}
163192

164-
HyperCube hc;
193+
165194
List list;
166195
int repeat=1;
167196
char command[200];
168197
clock_t begin = clock();
169198
d = findDim(inputFile);
170199
printf("DIMENSION = %d\n",d);
171-
hc = initializeHyperCube();
200+
172201
list = initializeList();
173-
readFile(inputFile,hc,&list);
202+
int numberOfVectorsInFile = 0;
203+
readFile(inputFile,&list,&numberOfVectorsInFile);
174204
clock_t end = clock();
175205
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
176-
printf("Created Hypercube in : %f seconds\n",time_spent);
206+
printf("Parsed input file in : %f seconds\n",time_spent);
207+
printf("Number of vectors in input file: %d\n",numberOfVectorsInFile);
208+
209+
printf("Findind optimal value of w based on the input file\n");
210+
begin = clock();
211+
w = wValueCalculation(list,numberOfVectorsInFile);
212+
w /= W_DIVIDER;
213+
end = clock();
214+
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
215+
printf("Found value of w in %f seconds, w = %d\n",time_spent,w );
216+
217+
HyperCube hc;
218+
begin = clock();
219+
hc = initializeHyperCube();
220+
insertFromListToHyperCube(list,hc);
221+
end = clock();
222+
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
223+
printf("Created HyperCube in : %f seconds\n",time_spent);
224+
177225
while(1){
178226
if(repeat){
179227
readQueryFile(queryFile,outputFile,hc,list,n,r,probes,m);

mainLSH.c

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
#include "./parsing/parsingLSH.h"
1111
#include "./hashTable/hashTableList/hashTableList.h"
1212

13-
#define W_VALUE 4
13+
#define W_DIVIDER 10
1414

1515
int d;
1616
int w;
@@ -20,12 +20,21 @@ int hashTableSize;
2020
int wValueCalculation(List list,int numberOfVectorsInFile){
2121
long double sumDist = 0.0;
2222
int count=0;
23-
int stopBound = 0.001*numberOfVectorsInFile*numberOfVectorsInFile;
23+
double persentageToCheck;
24+
if(numberOfVectorsInFile<=1000){
25+
persentageToCheck = 0.1;
26+
}else if(numberOfVectorsInFile<=10000){
27+
persentageToCheck = 0.001;
28+
}else if (numberOfVectorsInFile<=100000){
29+
persentageToCheck = 0.0001;
30+
}else{
31+
persentageToCheck = 0.000001;
32+
}
33+
int stopBound = persentageToCheck*numberOfVectorsInFile*numberOfVectorsInFile;
2434
while(list!=NULL){
2535
List nested = list;
2636
while(nested!=NULL){
2737
if(count>stopBound){
28-
printf("%d\n",count);
2938
return floor(sumDist/count);
3039
}
3140
sumDist += distance_metric(getVector(list),getVector(nested),d);
@@ -59,7 +68,6 @@ int main(int argc, char *argv[]) {
5968
double radius=10000;
6069
hashTableSize = 1000;
6170
k_LSH = 4;
62-
w = W_VALUE;
6371

6472
while((option = getopt(argc, argv, "i:q:k:L:o:N:R:")) != -1){
6573
switch(option){
@@ -163,8 +171,7 @@ int main(int argc, char *argv[]) {
163171
printf("Findind optimal value of w based on the input file\n");
164172
begin = clock();
165173
w = wValueCalculation(list,numberOfVectorsInFile);
166-
w /= 10;
167-
// w=6;
174+
w /= W_DIVIDER;
168175
end = clock();
169176
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
170177
printf("Found value of w in %f seconds, w = %d\n",time_spent,w );

outputCluster

Lines changed: 0 additions & 46 deletions
This file was deleted.

outputCluster_BIG_w1_l5

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
Algorithm: Range Search LSH
2+
CLUSTER-1 {size: 161974
3+
[ 60.464223 18.693490 5.382828 4.996876 8.492653 8.664305 9.263454 21.110362 71.814831 44.808515 27.576314 14.581446 10.665632 7.918319 7.549484 19.352663 18.364145 26.488991 36.819615 31.566316 26.032540 11.324131 6.940670 7.948450 12.128685 13.837516 17.573449 16.631043 19.158530 11.332360 5.713024 6.472064 99.761661 24.847507 6.420915 7.399040 13.180752 13.445370 13.325636 37.896995 112.959020 55.092612 24.510451 11.247112 8.074099 7.911424 9.974313 40.902923 19.647494 22.949539 36.378622 38.883213 41.704223 36.430526 17.522083 9.861405 16.128828 12.982481 19.152649 25.070458 37.655811 40.337356 21.497254 11.436257 89.571938 15.986708 6.881300 8.360939 13.577213 13.692666 17.070703 53.297948 103.104482 15.117305 4.985020 4.236638 6.024618 32.387225 82.506129 99.456730 14.801742 6.305584 6.297288 8.655586 28.653041 100.199442 101.755059 28.909757 21.383617 9.297070 5.942406 8.489420 24.743175 69.867803 63.340786 24.499153 27.453776 8.965452 6.323750 7.148579 10.450992 11.490454 15.399632 24.526180 30.716651 9.171720 8.173510 6.866735 10.019227 35.601565 77.727766 55.849535 18.497955 12.086364 10.402117 6.387061 10.894906 55.518414 79.039966 27.006765 21.616961 11.500841 7.019057 4.338050 7.307170 22.389641 30.948021 19.399564 ]
4+
}
5+
CLUSTER-2 {size: 144847
6+
[ 25.200319 22.830369 18.977809 12.913178 9.859758 7.404882 8.198532 11.331699 25.357029 49.478297 79.407178 32.020088 10.262816 10.114865 12.898163 12.070129 26.408446 34.191589 79.750746 39.783305 9.375137 7.810854 12.779772 17.510237 24.791710 21.872276 36.172747 20.224156 7.965114 4.840042 6.850805 13.906761 84.586915 45.804591 19.998407 15.675870 13.429477 9.270640 9.193089 25.223746 94.338828 86.720576 87.705012 29.539495 7.297795 7.785215 8.708793 25.002428 21.663028 35.028376 99.044590 79.039881 25.086891 9.374828 8.239349 12.130337 25.335210 24.001021 58.675808 54.844442 25.393964 9.618669 7.342632 13.605904 90.501369 25.219803 15.436204 17.744112 14.895573 8.010639 7.863578 37.054425 109.536124 29.514729 14.771189 11.787264 9.317534 9.921583 18.960815 73.162015 28.030618 13.025067 19.582716 32.859318 39.258184 31.993892 30.366423 27.496544 19.710633 10.886743 17.723476 34.780375 37.249594 21.777375 17.725246 16.180254 51.232522 15.114950 11.415707 11.914624 9.837947 5.263048 5.842570 23.738174 80.673535 17.356480 11.756816 10.989345 10.130870 11.316011 21.508783 58.577876 26.114084 9.534330 9.572394 13.878191 25.363887 27.095551 31.650633 31.230651 13.485348 6.293125 6.233952 12.962554 20.474748 15.122443 16.071165 14.858498 ]
7+
}
8+
CLUSTER-3 {size: 234404
9+
[ 19.145945 9.264190 7.418855 11.430172 17.337017 16.213624 17.137042 16.590893 26.420827 13.151162 13.784275 13.974747 24.788459 34.447053 37.592707 30.171546 63.754624 16.576164 14.955170 13.497323 12.876680 18.334848 30.930388 53.008329 42.779080 14.828750 12.355969 13.332482 13.052336 8.029930 8.448090 22.719452 28.967858 13.344652 14.725080 26.716533 34.257233 28.746930 22.351725 19.776135 30.903510 15.438063 18.579894 28.145461 40.651872 44.147317 37.760283 27.393306 97.485037 25.719152 16.562875 14.804453 12.328855 15.558169 27.783894 70.386743 80.081255 23.726979 16.828634 20.436567 19.788001 12.134894 11.738330 38.691777 32.278508 23.015621 44.629885 43.813562 28.806305 14.568105 9.332417 16.637760 27.478440 32.092520 80.339778 68.606956 32.380484 15.112260 12.148757 16.896116 88.346894 73.614912 71.888233 28.289666 10.761277 12.468696 12.900998 27.574317 78.068772 39.203435 21.074516 18.509519 17.957802 14.174251 13.375062 28.828737 27.092031 22.385405 31.232237 19.864068 10.545780 6.071905 7.129969 14.332010 29.837229 36.286785 70.519622 42.610672 13.818556 9.789531 14.836449 19.649728 31.563968 48.188594 71.166102 35.764454 13.077032 13.225601 15.304164 13.894564 29.042696 22.192499 19.974678 16.129043 12.597281 9.474728 9.862178 13.496127 ]
10+
}
11+
CLUSTER-4 {size: 169144
12+
[ 27.441893 22.800016 15.010798 12.290587 11.400912 8.776578 8.100822 12.013422 28.254462 54.453407 68.377049 35.538128 13.972508 9.288004 9.222154 9.689570 20.526959 34.113947 70.749369 54.041933 17.500789 7.757261 9.284496 11.502603 22.178029 20.426428 24.581437 21.972211 12.694608 6.602015 6.277104 11.264565 81.408783 37.726343 14.545770 12.535107 13.792671 13.379268 13.420593 28.603169 94.633439 81.371970 64.542094 28.338043 9.351958 8.653451 10.132644 28.923123 18.739259 29.937826 76.713629 84.129082 53.548408 22.694163 10.890174 10.867275 25.854730 19.466606 30.431560 48.455660 45.557641 21.313223 10.919433 14.598555 80.969068 20.202992 10.678158 12.284753 14.698055 13.480919 15.136164 44.882789 98.122162 21.135382 8.268241 7.429146 10.133284 27.465386 55.142295 90.672559 19.208489 8.021948 9.079535 20.734423 56.792084 89.780922 72.795113 30.340134 26.541352 11.263554 9.247420 19.418185 45.817984 52.237350 32.541589 23.248520 30.179466 10.791626 7.820682 9.417632 11.891564 10.797264 12.497289 24.388623 36.816082 9.206162 7.232224 8.641756 14.479132 33.866284 61.461190 60.558080 20.365029 9.462317 7.883854 8.491650 22.032355 62.436693 67.223361 31.290034 23.804663 9.930751 6.412985 7.035377 13.331251 23.297750 23.297406 20.809216 ]
13+
}
14+
CLUSTER-5 {size: 34773
15+
[ 24.437149 17.855954 16.661871 14.772780 12.931137 12.494650 14.135333 13.787919 33.348136 35.699252 41.686295 27.865766 16.301470 24.452619 29.509834 19.885449 45.587862 25.266783 35.912733 34.234744 19.724128 16.698575 26.841026 33.682683 39.185129 16.395224 15.562956 15.719869 14.102647 7.852628 8.554770 20.070834 46.117485 27.102015 22.214489 24.539239 25.380160 25.858256 22.214188 21.825812 58.380807 43.953214 39.773292 29.235692 24.222916 34.544251 34.434139 29.199184 63.358595 24.647214 37.198628 47.882756 37.855262 20.103956 26.726881 43.837580 65.736452 20.711923 20.250816 28.216581 29.318084 13.506478 11.567095 33.368000 44.968247 20.607621 27.605333 30.715941 26.749605 18.299544 14.877564 25.109028 58.242163 26.025267 39.873862 37.009842 25.299093 20.134099 26.139291 43.808596 60.355992 35.906412 32.760560 31.468691 41.663647 31.891574 27.193932 31.134626 62.290741 22.020857 13.017652 21.789093 32.729274 19.727363 16.420686 30.987269 26.156266 13.929316 17.441348 14.964786 14.141728 11.126770 12.391687 17.475990 37.110578 20.077832 33.284510 25.777641 18.387443 20.070593 30.423229 34.612963 32.684866 24.812779 29.905690 21.252080 25.916721 28.938677 32.586221 26.787046 32.069359 13.267490 9.047178 11.689711 17.126529 13.210667 15.163609 20.818347 ]
16+
}
17+
CLUSTER-6 {size: 254858
18+
[ 28.385282 13.575424 8.075108 6.689389 10.680599 20.248477 27.674719 22.500029 26.251601 17.674256 16.375451 11.066509 16.703925 52.468747 66.442167 30.645448 34.138869 12.766403 15.637887 13.968577 15.421942 37.798293 64.252838 49.578513 29.456397 11.898695 10.321916 10.966778 14.741676 14.511096 15.914383 22.778172 34.489466 14.124298 8.769267 13.652763 29.693918 49.204865 46.319121 26.915241 28.641470 15.609151 13.603244 15.164281 34.702398 77.763214 77.972934 30.169514 88.110394 19.367909 14.473701 14.077615 12.840321 30.170046 62.229049 78.654770 75.680523 20.347298 13.889771 16.846404 21.280402 17.994732 18.164392 43.521919 31.293847 16.269767 20.955136 27.610093 34.980594 27.579639 17.456263 18.618041 31.125152 23.563242 40.418773 45.453894 40.707547 28.721367 20.078063 20.718098 92.506100 51.589049 32.110049 18.114463 13.358653 16.556431 18.188422 35.935060 80.319382 27.379038 11.747155 13.824835 20.043428 19.536784 17.767288 34.168002 21.035971 15.185264 17.448848 15.914850 16.304401 10.111948 7.729579 12.320660 25.921833 26.478954 39.415989 35.049508 23.958535 13.166076 14.784236 17.852866 53.242811 40.405546 35.657155 22.209065 14.004219 13.864535 15.730302 19.672357 44.825865 18.501916 9.623239 9.215071 12.053631 11.277167 11.881578 19.482116 ]
19+
}
20+
clustering_time: 302.399260 seconds

0 commit comments

Comments
 (0)