/*
My c++ solution for the c++ vs. perl bakeoff at
http://www.hardforum.com/showthread.php?t=1077745.

The program takes a MySQL SQL dump file (lines with a comma-separated list of
quoted values with \ used to escape certain chars in each value) from Wikipedia
and chops it up into a tab-delimited file (lines with a tab-separated list of
values).

The file used is page.sql.gz from http://download.wikimedia.org/enwiki/20060702/.
See http://www.hardforum.com/showpost.php?p=1029721556&postcount=56 for correct
results.
*/

#include <cstddef>
#include <fstream>
#include <string>

namespace {

// Statement prefix identifying the dump lines that are converted; every other
// line of the input is ignored.
const char kInsertPrefix[] = "INSERT INTO `page` VALUES";
const std::size_t kInsertPrefixLength = sizeof(kInsertPrefix) - 1;

// Index at which parseLine starts scanning: the prefix plus the single
// character that follows it (presumably the space before the first '(').
const std::size_t kRowDataOffset = kInsertPrefixLength + 1;

}  // namespace

// Decodes one quoted value of `s` and appends the result to `out`.
//
// The character at `i` on entry is skipped without being inspected (parseLine
// passes the index of the opening quote). On return `i` is the index of the
// closing quote, or s.size() if the value is never terminated; in that case
// nothing beyond the last real character of `s` is read or appended.
//
// Decoding rules:
//   \' and \\   -> the escaped character itself
//   \<other>    -> the backslash is dropped; <other> is then decoded normally
//   '           -> ends the value
//   _           -> a space
//   space       -> dropped
//   anything else is copied unchanged (including a backslash that is the very
//   last character of `s`).
inline void parseQuotedValue(const std::string& s, std::size_t& i, std::string& out) {
    if (i >= s.size()) {
        return;
    }
    // Bounds are checked before every read so an unterminated value cannot
    // pick up the string's terminating '\0'.
    for (++i; i < s.size(); ++i) {
        const char c = s[i];
        if (c == '\\' && i + 1 < s.size()) {
            const char escaped = s[i + 1];
            if (escaped == '\'' || escaped == '\\') {
                out.push_back(escaped);
                ++i;  // consume the escaped character as well
            }
            // Any other escape: only the backslash is discarded here; the
            // following character is handled by the next iteration.
        } else if (c == '\'') {
            break;  // closing quote; leave `i` on it for the caller
        } else if (c == '_') {
            out.push_back(' ');
        } else if (c != ' ') {
            out.push_back(c);
        }
    }
}

// Converts one dump line into tab-delimited rows and writes them to `out`.
//
// Scanning starts at kRowDataOffset, so `s` is expected to begin with
// kInsertPrefix (main only passes such lines). Outside quoted values a ','
// becomes a tab, a ')' ends the row with "\r\n", '(' is dropped and every
// other character is copied unchanged.
inline void parseLine(const std::string& s, std::ofstream& out) {
    std::string buffer;
    buffer.reserve(s.size());  // avoid repeated regrowth on very long lines

    for (std::size_t i = kRowDataOffset; i < s.size(); ++i) {
        const char c = s[i];
        if (c == '\'') {
            parseQuotedValue(s, i, buffer);
        } else if (c == ',') {
            buffer.push_back('\t');
        } else if (c == ')') {
            buffer.push_back('\r');
            buffer.push_back('\n');
            // Together with the loop's ++i this steps over the two characters
            // after ')' (the ",(" between rows). After the last row it simply
            // moves `i` past the end, which terminates the loop.
            i += 2;
        } else if (c != '(') {
            buffer.push_back(c);
        }
    }
    out << buffer;
}

// Usage: <program> <input dump> <output file>
//
// Returns 1 if the argument count is wrong, if either file cannot be opened,
// or if writing the output failed; returns 0 otherwise.
int main(int argc, char* argv[]) {
    if (argc != 3) {
        return 1;
    }

    std::ifstream in(argv[1], std::ios::binary);
    if (!in) {
        return 1;
    }

    std::ofstream out(argv[2], std::ios::binary);
    if (!out) {
        return 1;
    }

    for (std::string s; std::getline(in, s); ) {
        // Bounded prefix comparison: unlike find(...) == 0 it never scans the
        // rest of a line that does not start with the prefix.
        if (s.compare(0, kInsertPrefixLength, kInsertPrefix) == 0) {
            parseLine(s, out);
        }
    }

    // Surface write errors (e.g. a full disk) through the exit code.
    out.flush();
    return out ? 0 : 1;
}

// g++ -Wall -Wextra bakeoff-0.cpp -o bakeoff-0 -O3 -s