// // Read a list of data points from a file, each data point a line of n tab-separated or comma-separated doubles. // Do least-squared regression for predicting the last dimension of each data point from the first n-1. // #include #include #include #include #include using namespace std; char separator = '\t'; class MyException : public exception { public: MyException(const char* m) : _msg(m) {} virtual ~MyException() throw() {} virtual const char* what() const throw() { return _msg.c_str(); } protected: string _msg; }; static void Assert( int line, const char* filename, const char* condition, const char* message) { static const int bufSize = 4000; char y[bufSize]; sprintf(y, "line=[%d], file=[%s], condition=[%s]: %s", line, filename, condition, message); throw MyException(y); } #define ASSERT(condition, message) do { \ if (!(condition)) Assert(__LINE__, __FILE__, #condition, message); \ } while(0) void LoadAndSum(int& c, int& n, vector& sums, vector& dots) { // open the file, read the first line static const int c_len = 10000; char line[c_len]; ASSERT(fgets(line, c_len, stdin), "first input line is missing"); // figure out the separator if (!strchr(line, '\t')) { separator = ','; ASSERT(strchr(line, ','), "data points must be separated by tabs or commas"); } // figure out from the first line what n is, fill data[] with the first line char *text = line; vector data; for (n = 0; ; n++) { double d; ASSERT(1 == sscanf(text, "%lf", &d), "missing number"); data.push_back(d); text = strchr(text, separator); if (!text) { break; } ++text; // skip over separator } ASSERT(++n >= 2, "must be at least two columns"); // initialize sums[], dots[] from the first line sums.resize(n); dots.resize(n*n); for (int i = 0; i < n; i++) { sums[i] = data[i]; for (int j = 0; j < n; j++) { dots[n*i + j] = data[i] * data[j]; } } c = 1; // now read the rest of the file while (fgets(line, c_len, stdin)) { text = line; for (int i = 0; i < n; i++) { double q; ASSERT(1 == sscanf(text, "%lf", &q), "missing number"); data[i] = q; text = strchr(text, separator)+1; } // add this line to sums[], dots[] for (int i = 0; i < n; i++) { sums[i] += data[i]; for (int j = 0; j < n; j++) { dots[n*i + j] += data[i] * data[j]; } } ++c; } } // Set up the matrix based on derivatives for each of the n variables. void SetUpDerivatives(const vector& sums, const vector& dots, int c, int n, vector& m) { for (int iRow = 0; iRow < n-1; iRow++) { for (int iCol = 0; iCol < n-1; iCol++) { m[(n+1)*iRow + iCol] = dots[n*iRow + iCol]; } m[(n+1)*iRow + n-1] = sums[iRow]; m[(n+1)*iRow + n] = dots[n*iRow + (n-1)]; } for (int iCol = 0; iCol < n-1; iCol++) { m[(n+1)*(n-1) + iCol] = sums[iCol]; } m[(n+1)*(n-1) + (n-1)] = c; m[(n+1)*(n-1) + n] = sums[n-1]; } double AbsoluteValue(double x) { return x > 0 ? x : -x; } // Apply Gaussian Elimination to the n*(n+1) matrix, leaving solved coefficients in the nth column void GaussianElimination(vector& m, int n) { // eliminate equations to form a diagonal matrix for (int iDiag = 0; iDiag < n; iDiag++) { // find the most nonzero row int bestRow = iDiag; for (int iRow = iDiag+1; iRow < n; iRow++) { if (AbsoluteValue(m[(n+1)*iRow + iDiag]) > AbsoluteValue(m[(n+1)*bestRow + iDiag])) { bestRow = iRow; } } // SOME row has to be nonzero double scale = m[(n+1)*bestRow + iDiag]; ASSERT(AbsoluteValue(m[(n+1)*bestRow + iDiag]) > 0.0, "equations are linearly dependent"); // Swap the rows at bestRow and iDiag. for (int iCol = iDiag; iCol < n+1; iCol++) { double temp = m[(n+1)*iDiag + iCol]; m[(n+1)*iDiag + iCol] = m[(n+1)*bestRow + iCol]; m[(n+1)*bestRow + iCol] = temp; } // Normalize row iDiag so m[iDiag][iDiag] == 1 scale = 1/scale; m[(n+1)*iDiag + iDiag] = 1.0; for (int iCol = iDiag+1; iCol < n+1; iCol++) { m[(n+1)*iDiag + iCol] *= scale; } // Eliminate this equation from all later equations for (int iRow = iDiag+1; iRow < n; iRow++) { scale = m[(n+1)*iRow + iDiag]; m[(n+1)*iRow + iDiag] = 0; for (int iCol = iDiag+1; iCol < n+1; iCol++) { m[(n+1)*iRow + iCol] -= m[(n+1)*iDiag + iCol] * scale; } } } // solve the rest of the matrix, leaving an identity matrix and the solved coefficients for (int iDiag = n-1; iDiag-- > 0; ) { for (int iCol = iDiag+1; iCol < n; iCol++) { m[(n+1)*iDiag + n] -= (m[(n+1)*iDiag + iCol] * m[(n+1)*iCol + n]); m[(n+1)*iDiag + iCol] = 0.0; } } } void Solve(const vector& sums, const vector& dots, int c, int n, vector& coef) { vector matrix; matrix.resize(n*(n+1)); // n rows, n+1 columns, for gaussian elimination SetUpDerivatives(sums, dots, c, n, matrix); GaussianElimination(matrix, n); coef.resize(n); for (int i=0; i& coef) { for (int i = 0; i < n; i++) { if (i > 0) { printf("%c", separator); } printf("%g", coef[i]); } printf("\n"); } void Driver() { int n; // number of columns int c; // count of data points vector sums; // sums[i] is sum(d_i) vector dots; // dots[n*i+j] is sum(d_i*d_j) vector coef; // coefficients for the least-squares linear regression formula LoadAndSum(c, n, sums, dots); Solve(sums, dots, c, n, coef); ShowCoefficients(n, coef); } int main(int argc, const char** argv) { try { ASSERT(argc == 1, "usage: regression < data.txt"); Driver(); return 0; } catch(const exception& ex) { fprintf(stderr, "%s\n", ex.what()); return 1; } }