/* min-hash-k: Program to estimate Jaccard similarity using MinHash with k hash functions. James S. Plank CS494/CS594 - Advanced Algorithms and Programming October, 2017 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace std; void usage(const string s) { fprintf(stderr, "usage: min-hash-k k bytes-per-hash files\n"); if (s != "") fprintf(stderr, "%s\n", s.c_str()); exit(1); } int main(int argc, char **argv) { vector files; // Filenames vector min_hashes; // The minimum hashes for each file. int k; // The number of hashes int bbh; // Bytes per hash int hash_buf_size; // Size of the hash buffers (k*bbh) padded to 16 unsigned int ff; // An integer that holds 0xffffffff unsigned char *hash; // Where we calculate the hashes for each string. ifstream f; string s; int findex; int i, j, l, sz; double Intersection; /* Read the command line arguments. */ if (argc < 4) usage(""); k = atoi(argv[1]); if (k <= 0) usage("k must be a number > 0"); bbh = atoi(argv[2]); if (bbh <= 0) usage ("bbh must be a number > 0\n"); for (i = 3; i < argc; i++) files.push_back(argv[i]); /* Calculate the number of bytes for all of the hashes, and allocate a hash buffer for temporary use, and a hash buffer for each data set to hold the minimum hashes for each data set. Set each byte of these buffers to 0xff, which is their maximum value, regardless of the size of the hash. */ hash_buf_size = k * bbh; if (hash_buf_size % 16 != 0) hash_buf_size += (16 - hash_buf_size % 16); ff = 0xffffffff; hash = (unsigned char *) malloc(hash_buf_size); min_hashes.resize(files.size()); for (i = 0; i < min_hashes.size(); i++) { min_hashes[i] = (unsigned char *) malloc(hash_buf_size); for (j = 0; j < hash_buf_size; j += sizeof(int)) { memcpy(min_hashes[i]+j, &ff, sizeof(int)); } } /* Error check code #1: Print out the initial values of all the hashes, which should all be ff's */ // for (i = 0; i < min_hashes.size(); i++) { // printf("%20s ", files[i].c_str()); // for (j = 0; j < k * bbh; j++) printf("%02x", min_hashes[i][j]); // printf("\n"); // } // exit(1); /* Read the data sets. For each value, you're going to calculate the k hashes and then update the minimum hashes for the data set. */ for (findex = 0; findex < files.size(); findex++) { f.clear(); f.open(files[findex].c_str()); if (f.fail()) { perror(files[findex].c_str()); exit(1); } while (getline(f, s)) { if (s.size() < 2) { fprintf(stderr, "File %s - can't have one-character strings.\n", files[findex].c_str()); exit(1); } /* Here is where we calculate the hash_buf_size bytes of hashes. */ j = 0; sz = s.size(); for (i = 0; i < hash_buf_size; i += 16) { s[0] ^= (j & 0xff); s[1] ^= (j >> 8); MD5((unsigned char *) s.c_str(), sz, hash+i); s[0] ^= (j & 0xff); s[1] ^= (j >> 8); j++; } /* And here is where we compare each unit of bbh bytes with the unit in min_hashes, and if it's smaller, we set the bbh bytes of min_hashes to the bytes in hash: */ j = 0; for (i = 0; i < k * bbh; i += bbh) { if (memcmp(hash+i, min_hashes[findex]+i, bbh) < 0) { memcpy(min_hashes[findex]+i, hash+i, bbh); } } /* Error check code #2: Print the hashes and the min hashes. */ // printf("%-20s %-20s\n", files[findex].c_str(), s.c_str()); // printf(" hash: "); // for (i = 0; i < k*bbh; i++) printf("%s%02x", (i%bbh == 0) ? " " : "", hash[i]); // printf("\n minh: "); // for (i = 0; i < k*bbh; i++) printf("%s%02x", (i%bbh == 0) ? " " : "", min_hashes[findex][i]); // printf("\n"); } f.close(); } /* Error check code #3: Print out the min hashes, so that we can double-check. */ // for (findex = 0; findex < files.size(); findex++) { // printf("%-10s ", files[findex].c_str()); // for (i = 0; i < k*bbh; i++) printf("%s%02x", (i%bbh == 0) ? " " : "", min_hashes[findex][i]); // printf("\n"); // } /* For each pair of files, compare the hashes. */ for (i = 0; i < files.size(); i++) { for (j = 0; j < files.size(); j++) { Intersection = 0; for (l = 0; l < k*bbh; l += bbh) { if (memcmp(min_hashes[i]+l, min_hashes[j]+l, bbh) == 0) Intersection++; } printf("%-30s %-30s %.6lf\n", files[i].c_str(), files[j].c_str(), Intersection / (double) k); } } exit(0); }