/* Messing around with a unicode container that isn't such a pain */

#include <algorithm>
#include <fstream>
#include <iostream>
#include <iterator>
#include <ostream>
#include <vector>

using namespace std;

/* The raw bytes that belong together: one glyph, or one marker such as a BOM. */
typedef vector<unsigned char> group; // group of 8 bit unsigned chars

/* A string: a sequence of byte groups, kept separate so it can be walked
   group by group rather than byte by byte. */
typedef vector<group> ustr;

/*
 * Write every byte of g to file, in order, with nothing between the bytes.
 *
 * g    - bytes to emit
 * file - destination stream; open it with ios::binary if the bytes must
 *        reach the file untranslated
 */
void printGroup(const group& g, ostream& file)
{
    copy( g.begin(), g.end(), ostream_iterator<unsigned char>( file, "" ) );
}

/*
 * Write every group of u to file, in order, by calling printGroup on each.
 *
 * u    - sequence of byte groups to emit
 * file - destination stream
 */
void printUstr( const ustr& u, ostream& file )
{
    for ( ustr::const_iterator i = u.begin(); i != u.end(); ++i )
    {
        printGroup( *i, file );
    }
}

/* Build a group from a literal byte array; the array length is deduced. */
template <size_t N>
static group makeGroup( const unsigned char (&bytes)[N] )
{
    return group( bytes, bytes + N );
}

/*
 * Create (or truncate) the file at path using the given open mode and write
 * content to it.
 *
 * Returns true when the file was opened and all bytes were flushed without a
 * stream error; otherwise prints a message to cerr and returns false.
 */
static bool writeUstr( const char* path, const ustr& content, ios_base::openmode mode )
{
    ofstream out( path, mode );
    if ( !out )
    {
        cerr << "cannot open " << path << " for writing\n";
        return false;
    }

    printUstr( content, out );
    out.flush(); // surface buffered write errors before the stream state is tested
    if ( !out )
    {
        cerr << "error while writing " << path << '\n';
        return false;
    }
    return true;
}

/*
 * Writes the radical sign U+221A (and a few variations) to a set of small
 * files, one per encoding, to show that the same container works for all of
 * them.
 *
 * Returns 0 when every file was written, 1 if any file could not be written.
 */
int main()
{
    bool ok = true;

    /* UTF-16BE */
    const unsigned char bom16beBytes[] = { 0xFE, 0xFF };
    const unsigned char radic16beBytes[] = { 0x22, 0x1A };
    const group bom16be = makeGroup( bom16beBytes );

    ustr utf16be;
    utf16be.push_back( bom16be );
    utf16be.push_back( makeGroup( radic16beBytes ) );
    ok = writeUstr( "utf16be.txt", utf16be, ios::binary ) && ok;

    /* UTF-32BE */
    const unsigned char bom32beBytes[] = { 0x00, 0x00, 0xFE, 0xFF };
    const unsigned char radic32beBytes[] = { 0x00, 0x00, 0x22, 0x1A };

    ustr utf32be;
    utf32be.push_back( makeGroup( bom32beBytes ) );
    utf32be.push_back( makeGroup( radic32beBytes ) );
    ok = writeUstr( "utf32be.txt", utf32be, ios::binary ) && ok;

    /* UTF-8 with signature */
    const unsigned char sigUtf8Bytes[] = { 0xEF, 0xBB, 0xBF };
    const unsigned char radicUtf8Bytes[] = { 0xE2, 0x88, 0x9A };
    const group radicUtf8 = makeGroup( radicUtf8Bytes );

    ustr utf8WithSig;
    utf8WithSig.push_back( makeGroup( sigUtf8Bytes ) );
    utf8WithSig.push_back( radicUtf8 );
    ok = writeUstr( "utf8withsig.txt", utf8WithSig, ios::binary ) && ok;

    /* UTF-8 without signature: just the glyph's own group */
    ok = writeUstr( "utf8withoutsig.txt", ustr( 1, radicUtf8 ), ios::binary ) && ok;

    /* ascii ( silly, but .. ) -- one group per letter, opened in text mode */
    const unsigned char dBytes[] = { 'd' };
    const unsigned char oBytes[] = { 'o' };
    const unsigned char gBytes[] = { 'g' };

    ustr ascii;
    ascii.push_back( makeGroup( dBytes ) );
    ascii.push_back( makeGroup( oBytes ) );
    ascii.push_back( makeGroup( gBytes ) );
    ok = writeUstr( "ascii.txt", ascii, ios::out ) && ok;

    /* UTF16LE */
    const unsigned char bom16leBytes[] = { 0xFF, 0xFE };
    const unsigned char radic16leBytes[] = { 0x1A, 0x22 };

    ustr utf16le;
    utf16le.push_back( makeGroup( bom16leBytes ) );
    utf16le.push_back( makeGroup( radic16leBytes ) );
    ok = writeUstr( "utf16le.txt", utf16le, ios::binary ) && ok;

    /* UTF32LE */
    const unsigned char bom32leBytes[] = { 0xFF, 0xFE, 0x00, 0x00 };
    const unsigned char radic32leBytes[] = { 0x1A, 0x22, 0x00, 0x00 };

    ustr utf32le;
    utf32le.push_back( makeGroup( bom32leBytes ) );
    utf32le.push_back( makeGroup( radic32leBytes ) );
    ok = writeUstr( "utf32le.txt", utf32le, ios::binary ) && ok;

    /* UTF16BE with combining character: the radical (22 1A) followed by the
       accent (03 01), both held in a single group */
    const unsigned char radicAccentBytes[] = { 0x22, 0x1A, /* accent */ 0x03, 0x01 };

    ustr utf16beCombining;
    utf16beCombining.push_back( bom16be );
    utf16beCombining.push_back( makeGroup( radicAccentBytes ) );
    ok = writeUstr( "utf16becombining.txt", utf16beCombining, ios::binary ) && ok;

    /*
       Basically, all bytes for a glyph are contained in their own group and
       can be accessed individually, as opposed to computing the desired byte.

       Since they are grouped together, you can iterate through a ustr glyph
       by glyph as opposed to byte by byte.

       chars can be stored in either endianness.

       chars are stored as chars, so there is no casting from int to char for
       file ops.

       Since everything is just a bunch of bytes, writing to a file is simple
       and the encoding can be whatever.
    */

    return ok ? 0 : 1;
}