/* jaccard-sort-hash: Program to calculate Jaccard similarity using sorted vectors of hashes. James S. Plank CS494/CS594 - Advanced Algorithms and Programming October, 2017 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace std; void usage(const string s) { fprintf(stderr, "usage: jaccard-sort-hash files\n"); if (s != "") fprintf(stderr, "%s\n", s.c_str()); exit(1); } int main(int argc, char **argv) { vector < vector > sets; unsigned char md5_buf[16]; unsigned long long ull; ifstream f; string s; int i, j, ip, jp; double Intersection; double Total; double Union; if (argc == 1) usage(""); /* Read the data sets and store the hashes into vectors. */ sets.resize(argc-1); for (i = 1; i < argc; i++) { f.clear(); f.open(argv[i]); if (f.fail()) { perror(argv[i]); exit(1); } while (getline(f, s)) { MD5((const unsigned char *) s.c_str(), s.size(), md5_buf); memcpy(&ull, md5_buf, sizeof(unsigned long long)); sets[i-1].push_back(ull); } f.close(); } /* Sort the vectors. */ for (i = 0; i < sets.size(); i++) sort(sets[i].begin(), sets[i].end()); /* For each pair of sets, calculate the Jaccard similarity directly. */ for (i = 0; i < sets.size(); i++) { for (j = 0; j < sets.size(); j++) { Total = sets[i].size() + sets[j].size(); Intersection = 0; ip = 0; jp = 0; while (ip < sets[i].size() && jp < sets[j].size()) { if (sets[i][ip] == sets[j][jp]) { Intersection++; ip++; jp++; } else if (sets[i][ip] < sets[j][jp]) { ip++; } else { jp++; } } Union = Total - Intersection; printf("%-30s %-30s %.6lf\n", argv[i+1], argv[j+1], Intersection / Union); } } exit(1); }