#ifndef DATAPROC_H
#define DATAPROC_H

#include <iostream>
#include <vector>

// NOTE(review): a using-directive in a header leaks into every file that
// includes it.  It is retained only because existing includers may rely on it.
using namespace std;

/*
 * Dataproc - accumulates double values and answers summary-statistics
 * queries about them.
 *
 * This version keeps every value that was added in a vector; the queries
 * are computed from that vector on demand.
 *
 * The include guard is spelled DATAPROC_H because names that begin with an
 * underscore followed by an uppercase letter are reserved for the
 * implementation.
 */
class Dataproc {
  public:
    void add_value(double d);   // Append one value to the collection.
    double mean();              // Arithmetic mean of the values added so far.
    int nentries();             // Number of values added so far.
    double variance();          // Mean squared deviation from the mean (divides by n).
    double stddev();            // Square root of variance().
    double max();               // Largest value added so far.
    double min();               // Smallest value added so far.

  protected:
    vector <double> v;          // Every value passed to add_value(), in insertion order.
};

#endif
We will add values to an instance of the Dataproc class, and then we can ask various things about the values, such as the max, the min, the mean, the variance and the standard deviation.
Our implementation uses a vector to store all the data, and then implements the methods using the vector. The methods are all fairly simple. Add_value() appends the value to the vector. Nentries() returns the size of the vector. Max() and Min() traverse the vector to find the max and min values. (In dataproc.cpp):
#include "dataproc.h"

#include <math.h>
#include <stdlib.h>      // exit()

#include <algorithm>     // min_element(), max_element()
#include <iostream>
#include <vector>

using namespace std;

/* Append d to the stored values. */
void Dataproc::add_value(double d)
{
  v.push_back(d);
}

/* Return how many values have been added.  The declared return type is int,
   so the vector's unsigned size is narrowed explicitly. */
int Dataproc::nentries()
{
  return static_cast<int>(v.size());
}

/* Return the smallest stored value.
   If nothing has been added, print a message on stderr and exit(1). */
double Dataproc::min()
{
  if (v.empty()) {
    cerr << "Dataproc::min() - Empty Dataproc\n";
    exit(1);
  }
  return *std::min_element(v.begin(), v.end());
}

/* Return the largest stored value.
   If nothing has been added, print a message on stderr and exit(1). */
double Dataproc::max()
{
  if (v.empty()) {
    cerr << "Dataproc::max() - Empty Dataproc\n";
    exit(1);
  }
  return *std::max_element(v.begin(), v.end());
}
The program dp_test1.cpp performs a simple test of these functionalities:
#include "dataproc.h"

#include <iostream>
#include <set>

using namespace std;

/* Small driver: add the values 1 through 5 to a Dataproc and print the
   number of entries, the minimum and the maximum. */
int main()
{
  Dataproc dp;

  cout << "Adding 1, 2, 3, 4 and 5\n";
  // Count with an int; each value converts to double when passed to add_value().
  for (int i = 1; i <= 5; i++) dp.add_value(i);

  cout << "Nentries: " << dp.nentries() << endl;
  cout << "Min: " << dp.min() << endl;
  cout << "Max: " << dp.max() << endl;
  return 0;
}
It runs as you would expect:
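UNIX> make dp_test1 g++ -c dp_test1.cpp g++ -c dataproc.cpp g++ -o dp_test1 dp_test1.o dataproc.o UNIX> dp_test1 Adding 1, 2, 3, 4 and 5 Nentries: 5 Min: 1 Max: 5 UNIX>
Recall the definitions of mean, variance and standard deviation for values $x_1, \dots, x_n$: $$\mu = \frac{1}{n}\sum_{i=1}^{n} x_i, \qquad \sigma^2 = \frac{1}{n}\sum_{i=1}^{n} (x_i - \mu)^2, \qquad \sigma = \sqrt{\sigma^2}$$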
double Dataproc::mean() { int i; double n, total; n = v.size(); if (n == 0) { cerr << "Dataproc::mean() - Empty Dataproc\n"; exit (1); } total = 0; for (i = 0; i < v.size(); i++) total += v[i]; return total/n; } double Dataproc::variance() { double n, vtotal, m, diff; int i; m = mean(); n = v.size(); vtotal = 0; for (i = 0; i < v.size(); i++) { diff = (v[i] - m); vtotal += (diff*diff); } return vtotal/n; } double Dataproc::stddev() { return sqrt(variance()); } |
We can test these with dp_test2.cpp. The mean of [1,2,3,4,5] is 3, and the variance is $((1-3)^2 + (2-3)^2 + (3-3)^2 + (4-3)^2 + (5-3)^2)/5 = 2$.
UNIX> dp_test2 Adding 1, 2, 3, 4 and 5 Nentries: 5 Min: 1 Max: 5 Mean: 3 Variance: 2 Stddev: 1.41421 UNIX>
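Storing every value is not actually necessary: the mean needs only a running total and a count. If you massage the equation for variance, you can see how to compute it from running totals too: $$\sigma^2 = \frac{1}{n}\sum_{i=1}^{n} (x_i - \mu)^2 = \frac{1}{n}\sum_{i=1}^{n} x_i^2 - \mu^2$$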
So, now we implement a new Dataproc in dataproc_good.h. The methods are the same, but the implementation is different. Since we don't use a vector, we need a constructor that sets the initial values.
#ifndef DATAPROC_GOOD_H
#define DATAPROC_GOOD_H

#include <iostream>
#include <set>

// NOTE(review): a using-directive in a header leaks into every file that
// includes it.  It is retained only because existing includers may rely on it.
using namespace std;

/*
 * Dataproc - accumulates double values and answers summary-statistics
 * queries about them.
 *
 * This version does not store the individual values.  It keeps only running
 * aggregates, which the constructor must initialize.
 *
 * The include guard is spelled DATAPROC_GOOD_H because names that begin with
 * an underscore followed by an uppercase letter are reserved for the
 * implementation.
 */
class Dataproc {
  public:
    Dataproc();                 // Start with no values recorded.
    void add_value(double d);   // Fold one value into the running aggregates.
    double mean();              // Arithmetic mean of the values added so far.
    int nentries();             // Number of values added so far.
    double variance();          // Variance computed from the running sums.
    double stddev();            // Square root of variance().
    double max();               // Largest value added so far.
    double min();               // Smallest value added so far.

  protected:
    double total;               // Sum of all values added.
    double sqtotal;             // Sum of the squares of all values added.
    double n;                   // Count of values added (held as a double).
    double vmax;                // Largest value seen.
    double vmin;                // Smallest value seen.
};

#endif
The rest of the implementation is quite easy (dataproc_good.cpp):
// The running-aggregate Dataproc (with a constructor and the total/sqtotal/n
// members used below) is the one declared in dataproc_good.h.
#include "dataproc_good.h"

#include <math.h>
#include <stdlib.h>      // exit()

#include <iostream>

using namespace std;

/* Start with no values recorded.  vmax and vmin get placeholder values so
   they are never read uninitialized; the first add_value() overwrites both. */
Dataproc::Dataproc() : total(0), sqtotal(0), n(0), vmax(0), vmin(0)
{
}

/* Fold d into the running sum, the running sum of squares, the count and
   the min/max trackers. */
void Dataproc::add_value(double d)
{
  // The first value always becomes both the minimum and the maximum.
  if (n == 0 || d < vmin) vmin = d;
  if (n == 0 || d > vmax) vmax = d;

  total += d;
  sqtotal += d * d;
  n++;
}

/* Return how many values have been added (the count is held as a double). */
int Dataproc::nentries()
{
  return static_cast<int>(n);
}

/* Return the smallest value added.
   If nothing has been added, print a message on stderr and exit(1). */
double Dataproc::min()
{
  if (n == 0) {
    cerr << "Dataproc::min() - Empty dataproc\n";
    exit(1);
  }
  return vmin;
}

/* Return the arithmetic mean of the values added.
   If nothing has been added, print a message on stderr and exit(1). */
double Dataproc::mean()
{
  if (n == 0) {
    cerr << "Dataproc::mean() - Empty dataproc\n";
    exit(1);
  }
  return total / n;
}

/* Return the variance as (sum of squares)/n - mean^2.
   An empty collection is rejected by the call to mean(), which exits.
   Rounding in the subtraction can leave a result slightly below zero when the
   values are (nearly) identical; that is clamped to 0 so stddev() does not
   take the square root of a negative number. */
double Dataproc::variance()
{
  const double m = mean();
  const double var = sqtotal / n - m * m;
  return (var < 0) ? 0 : var;
}

/* Return the standard deviation: the square root of variance(). */
double Dataproc::stddev()
{
  return sqrt(variance());
}

/* Return the largest value added.
   If nothing has been added, print a message on stderr and exit(1). */
double Dataproc::max()
{
  if (n == 0) {
    cerr << "Dataproc::max() - Empty dataproc\n";
    exit(1);
  }
  return vmax;
}
The program dp_test3.cpp works just like dp_test2.cpp, except that it includes dataproc_good.h, and compiles with dataproc_good.o:
UNIX> make dp_test3 g++ -c dp_test3.cpp g++ -o dp_test3 dp_test3.o dataproc_good.o UNIX> dp_test3 Adding 1, 2, 3, 4 and 5 Nentries: 5 Min: 1 Max: 5 Mean: 3 Variance: 2 Stddev: 1.41421 UNIX>
Does the implementation really make a difference? Well, try adding 10,000,000 doubles and calculating the mean, variance and standard deviation. This is on my 2.16 GHz Macbook Pro (8/2010).
UNIX> make dp_big_bad dp_big_good g++ -c dp_big_bad.cpp g++ -o dp_big_bad dp_big_bad.o dataproc.o g++ -c dp_big_good.cpp g++ -o dp_big_good dp_big_good.o dataproc_good.o UNIX> time dp_big_bad Nentries: 10000000 Min: 5.97146e-08 Max: 1 Mean: 0.500011 Variance: 0.0833461 Stddev: 0.288697 6.540u 0.248s 0:06.81 99.5% 0+0k 0+0io 0pf+0w UNIX> time dp_big_good Nentries: 10000000 Min: 5.97146e-08 Max: 1 Mean: 0.500011 Variance: 0.0833461 Stddev: 0.288697 0.852u 0.009s 0:00.88 96.5% 0+0k 0+0io 0pf+0w UNIX>Yes, it makes a big difference!