/* min-hash-1: Program to estimate Jaccard similarity using MinHash with one hash function. James S. Plank CS494/CS594 - Advanced Algorithms and Programming October, 2017 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace std; void usage(const string s) { fprintf(stderr, "usage: min-hash-1 k files\n"); if (s != "") fprintf(stderr, "%s\n", s.c_str()); exit(1); } int main(int argc, char **argv) { vector files; // Filenames vector < set > min_hashes; // The k minimum hashes for each file. int k; // The number of hash values to keep ifstream f; string s; int findex; unsigned char hash[16]; unsigned long long ll; int i, j; set ::iterator liti, litj; double Intersection; double Union; double Total; /* Read the command line arguments. */ if (argc < 3) usage(""); k = atoi(argv[1]); if (k <= 0) usage("k must be a number > 0"); for (i = 2; i < argc; i++) files.push_back(argv[i]); min_hashes.resize(files.size()); /* Read the data sets. For each value, you're going to calculate one hash, decide if it should go into the set, and if so, insert it. If the hash set is to big, then delete the smallest element. (I.e., we're keeping track of the maximum hashes now, because that's easier to do than keeping track of the minimum hashes. */ for (findex = 0; findex < files.size(); findex++) { f.clear(); f.open(files[findex].c_str()); if (f.fail()) { perror(files[findex].c_str()); exit(1); } while (getline(f, s)) { MD5((unsigned char *) s.c_str(), s.size(), hash); memcpy((unsigned char *) &ll, hash, sizeof(long long)); /* Error check code 1: Print out the hashes. */ // printf("%-20s 0x%016llx\n", s.c_str(), ll); if (min_hashes[findex].size() < k) { min_hashes[findex].insert(ll); } else { liti = min_hashes[findex].begin(); if (ll > *liti) { min_hashes[findex].insert(ll); if (min_hashes[findex].size() > k) min_hashes[findex].erase(liti); } } } f.close(); } /* Error check code #2: Print out the min hashes. */ // for (findex = 0; findex != files.size(); findex++) { // printf("%s\n", files[findex].c_str()); // for (liti = min_hashes[findex].begin(); liti != min_hashes[findex].end(); liti++) { // printf(" 0x%016llx\n", *liti); // } // } /* For each pair of files, compare the hashes. */ for (i = 0; i < files.size(); i++) { for (j = 0; j < files.size(); j++) { liti = min_hashes[i].begin(); litj = min_hashes[j].begin(); Intersection = 0; while (liti != min_hashes[i].end() && litj != min_hashes[j].end()) { if (*liti == *litj) { Intersection++; liti++; litj++; } else if (*liti < *litj) { liti++; } else { litj++; } } Total = min_hashes[i].size() + min_hashes[j].size(); Union = Total - Intersection; printf("%-30s %-30s %.6lf\n", files[i].c_str(), files[j].c_str(), Intersection / Union); } } exit(0); }