/*
* utf8tocp Copyright (C) Mateusz Viste 2013
* converts UTF-8 text files to other codepages, as well as the other way around.
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation, either version 3 of the License, or (at your option) any
* later version.
*/
#include <stdio.h>
#include <string.h>
#define pVer "0.9"
#define pDate "2013"

/*
 * Prints the program banner, the command-line usage, the list of supported
 * encoding identifiers and the license notice to stdout.
 */
static void about(void) {
  puts("utf8tocp v" pVer " Copyright (C) Mateusz Viste " pDate);
  puts("utf8tocp is a tool able to convert UTF-8 text files into other codepages, as");
  puts("well as the other way around.");
  puts("");
  puts("Usage: utf8tocp [-r] encid file.txt");
  puts("");
  puts(" where encid is the identifier of the target encoding, from the list below:");
  puts(" 437 - Original IBM PC codepage 437");
  puts(" 850 - CP 850 (a.k.a. 'Latin1')");
  puts(" 1252 - Windows-1252 codepage");
  puts(" kam - Kamenicky encoding (a.k.a. CP867 or CP895)");
  puts(" maz - Mazovia (a.k.a. CP667, CP790 or CP991)");
  puts("");
  puts(" -r if specified, reverses the conversion (codepage -> UTF-8).");
  puts("");
  /* license notice: the words "it under" used to be printed twice in a row */
  puts("This program is free software: you can redistribute it and/or modify it under");
  puts("the terms of the GNU General Public License as published by the Free");
  puts("Software Foundation, either version 3 of the License, or (at your option) any");
  puts("later version.");
}
/*
 * Adjusts a 128-entry lookup table so that it describes the codepage named
 * by cpname. Entry [b - 128] holds the Unicode codepoint that byte value b
 * (128..255) stands for. Only the entries that differ from CP437 are
 * written, so the table is expected to contain the CP437 mapping on entry.
 *
 *   cpname       encoding identifier: "437", "850", "1252", "maz" or "kam"
 *   lookuptable  table of at least 128 entries, modified in place
 *
 * Returns 0 on success, or -1 when cpname is not a known identifier (the
 * table is left untouched in that case).
 */
static int loadlookuptable(char *cpname, long *lookuptable) {
  /* a single override: a byte value (128..255) and its Unicode codepoint */
  struct override {
    int byte;
    long codepoint;
  };

  /* CP 850 */
  static const struct override cp850[] = {
    {155, 0x00F8}, {157, 0x00D8}, {158, 0x00D7}, {169, 0x00AE},
    {181, 0x00C1}, {182, 0x00C2}, {183, 0x00C0}, {184, 0x00A9},
    {189, 0x00A2}, {190, 0x00A5}, {198, 0x00E3}, {199, 0x00C3},
    {207, 0x00A4}, {208, 0x00F0}, {209, 0x00D0}, {210, 0x00CA},
    {211, 0x00CB}, {212, 0x00C8}, {213, 0x0131}, {214, 0x00CD},
    {215, 0x00CE}, {216, 0x00CF}, {221, 0x00A6}, {222, 0x00CC},
    {224, 0x00D3}, {226, 0x00D4}, {227, 0x00D2}, {228, 0x00F5},
    {229, 0x00D5}, {231, 0x00FE}, {232, 0x00DE}, {233, 0x00DA},
    {234, 0x00DB}, {235, 0x00D9}, {236, 0x00FD}, {237, 0x00DD},
    {238, 0x00AF}, {239, 0x00B4}, {240, 0x00AD}, {242, 0x2017},
    {243, 0x00BE}, {244, 0x00B6}, {245, 0x00A7}, {247, 0x00B8},
    {249, 0x00A8}, {251, 0x00B9}, {252, 0x00B3}
  };

  /* MAZOVIA (PL) */
  static const struct override mazovia[] = {
    {134, 0x0105}, {141, 0x0107}, {143, 0x0104}, {144, 0x0118},
    {145, 0x0119}, {146, 0x0142}, {149, 0x0106}, {152, 0x015A},
    {156, 0x0141}, {158, 0x015B}, {160, 0x0179}, {161, 0x017B},
    {163, 0x00D3}, {164, 0x0144}, {165, 0x0143}, {166, 0x017A},
    {167, 0x017C}
  };

  /* Kamenicky encoding (CZ) */
  static const struct override kamenicky[] = {
    {128, 0x010C}, {131, 0x010F}, {133, 0x010E}, {134, 0x0164},
    {135, 0x010D}, {136, 0x011B}, {137, 0x011A}, {138, 0x0139},
    {139, 0x00CD}, {140, 0x013E}, {141, 0x013A}, {143, 0x00C1},
    {145, 0x017E}, {146, 0x017D}, {149, 0x00D3}, {150, 0x016F},
    {151, 0x00DA}, {152, 0x00FD}, {155, 0x0160}, {156, 0x013D},
    {157, 0x00DD}, {158, 0x0158}, {159, 0x0165}, {164, 0x0148},
    {165, 0x0147}, {166, 0x016E}, {167, 0x00D4}, {168, 0x0161},
    {169, 0x0159}, {170, 0x0155}, {171, 0x0154}, {173, 0x00A7}
  };

  /* Windows-1252, bytes 128..159 in order ('?' marks the unassigned bytes) */
  static const long cp1252low[32] = {
    0x20AC, '?',    0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, '?',    0x017D, '?',
    '?',    0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, '?',    0x017E, 0x0178
  };

  const struct override *patch;
  size_t count, i;

  /* IBM CP437: nothing to change */
  if (strcmp(cpname, "437") == 0) return(0);

  /* Windows-1252 replaces the whole table: an explicit first quarter, then
   * bytes 160..255 which map to the codepoint of the same numeric value */
  if (strcmp(cpname, "1252") == 0) {
    for (i = 0; i < 32; i++) lookuptable[i] = cp1252low[i];
    for (i = 32; i < 128; i++) lookuptable[i] = (long)i + 128;
    return(0);
  }

  /* the remaining codepages are expressed as a list of overrides */
  if (strcmp(cpname, "850") == 0) {
    patch = cp850;
    count = sizeof(cp850) / sizeof(cp850[0]);
  } else if (strcmp(cpname, "maz") == 0) {
    patch = mazovia;
    count = sizeof(mazovia) / sizeof(mazovia[0]);
  } else if (strcmp(cpname, "kam") == 0) {
    patch = kamenicky;
    count = sizeof(kamenicky) / sizeof(kamenicky[0]);
  } else { /* unknown encoding */
    return(-1);
  }

  for (i = 0; i < count; i++) {
    lookuptable[patch[i].byte - 128] = patch[i].codepoint;
  }
  return(0);
}
/*
 * Translates a Unicode codepoint into the byte that represents it in the
 * codepage described by lookuptable (128 entries, entry i describing byte
 * value i + 128).
 *
 * Codepoints below 128 are returned unchanged. Otherwise the table is
 * scanned from the start and the byte value of the first matching entry is
 * returned; '?' is returned when no entry matches.
 */
static int codepagelookup(long codepoint, long *lookuptable) {
  const long *entry;
  const long *tableend = lookuptable + 128;

  if (codepoint >= 128) {
    /* not plain ASCII: search the upper half of the codepage */
    for (entry = lookuptable; entry != tableend; entry++) {
      if (*entry == codepoint) return((int)(entry - lookuptable) + 128);
    }
    /* the codepage has no such character, use a fallback */
    return('?');
  }

  /* 7bit values are passed through as-is */
  return((int)codepoint);
}
/*
 * Translates a byte of codepage-encoded text into its Unicode codepoint.
 *
 * Byte values below 128 are returned unchanged; any other value is resolved
 * through lookuptable, whose entry [bytecode - 128] holds the codepoint.
 * No range check is performed on bytecode.
 */
static long unicodelookup(int bytecode, long *lookuptable) {
  return((bytecode < 128) ? (long)bytecode : lookuptable[bytecode - 128]);
}
/*
 * Reads one UTF-8 encoded character from fd and returns its codepoint.
 *
 * Returns EOF when no more data can be read at a character boundary.
 * Returns '?' for input that is not well-formed:
 *   - a first byte that cannot start a 1..4 byte sequence,
 *   - a sequence cut short by the end of the file,
 *   - a sequence in which a following byte is not of the form 10xxxxxx; that
 *     byte is pushed back onto the stream, so it is decoded on its own by the
 *     next call instead of being swallowed.
 */
static long getNextUnicodeTokenFromFile(FILE *fd) {
  int leadbyte, contbyte, bytelen, x;
  long result;

  /* read the 1st byte - this will tell us how many bytes follow */
  leadbyte = fgetc(fd);
  if (leadbyte == EOF) return(EOF);

  if ((leadbyte & 0x80) == 0) {           /* 0xxxxxxx (1 byte) */
    return(leadbyte);
  } else if ((leadbyte & 0xE0) == 0xC0) { /* 110xxxxx (2 bytes) */
    bytelen = 2;
    result = leadbyte & 0x1F;
  } else if ((leadbyte & 0xF0) == 0xE0) { /* 1110xxxx (3 bytes) */
    bytelen = 3;
    result = leadbyte & 0x0F;
  } else if ((leadbyte & 0xF8) == 0xF0) { /* 11110xxx (4 bytes) */
    bytelen = 4;
    result = leadbyte & 0x07;
  } else {                                /* invalid UTF-8 byte */
    return('?');
  }

  /* read all following bytes, each contributing its 6 low bits */
  for (x = 1; x < bytelen; x++) {
    contbyte = fgetc(fd);
    /* truncated sequence: the next call will report EOF */
    if (contbyte == EOF) return('?');
    if ((contbyte & 0xC0) != 0x80) {
      /* not a continuation byte: give it back so it is not lost */
      ungetc(contbyte, fd);
      return('?');
    }
    result = (result << 6) | (contbyte & 0x3F);
  }
  return(result);
}
/*
 * Writes a codepoint to stdout as UTF-8.
 *
 * Values below 0x80 are written as a single byte, values below 0x800 as two
 * bytes, values below 0x10000 as three bytes and values below 0x200000 as
 * four bytes. Anything larger is written as a single '?'.
 */
static void outputUnicodeToken(long codepoint) {
  int seq[4]; /* bytes to emit, in output order */
  int len = 0;
  int i;

  if (codepoint < 0x80) {             /* single byte, same as ASCII */
    seq[len++] = (int)codepoint;
  } else if (codepoint < 0x800L) {    /* 110xxxxx 10xxxxxx */
    seq[len++] = (int)(0xC0 | ((codepoint >> 6) & 0x1F));
    seq[len++] = (int)(0x80 | (codepoint & 0x3F));
  } else if (codepoint < 0x10000L) {  /* 1110xxxx 10xxxxxx 10xxxxxx */
    seq[len++] = (int)(0xE0 | ((codepoint >> 12) & 0xF));
    seq[len++] = (int)(0x80 | ((codepoint >> 6) & 0x3F));
    seq[len++] = (int)(0x80 | (codepoint & 0x3F));
  } else if (codepoint < 0x200000L) { /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    seq[len++] = (int)(0xF0 | ((codepoint >> 18) & 0x7));
    seq[len++] = (int)(0x80 | ((codepoint >> 12) & 0x3F));
    seq[len++] = (int)(0x80 | ((codepoint >> 6) & 0x3F));
    seq[len++] = (int)(0x80 | (codepoint & 0x3F));
  } else {                            /* unknown stuff */
    seq[len++] = '?';
  }

  for (i = 0; i < len; i++) putchar(seq[i]);
}
/*
 * Program entry point.
 *
 * Accepted invocations:
 *   utf8tocp encid file      converts file from UTF-8 to the codepage encid
 *   utf8tocp -r encid file   converts file from the codepage encid to UTF-8
 * Any other number of arguments prints the help screen.
 * The converted text is written to stdout.
 *
 * Exit status:
 *   0  success (or help screen displayed)
 *   1  unknown encoding identifier
 *   2  the input file could not be opened
 *   3  unknown option
 *   4  a read or write error occurred during the conversion
 */
int main(int argc, char **argv) {
  /* lookuptable is preloaded with CP437 at start */
  long lookuptable[128] = {0x00C7,0x00FC,0x00E9,0x00E2,0x00E4,0x00E0,0x00E5,0x00E7,0x00EA,0x00EB,0x00E8,0x00EF,0x00EE,0x00EC,0x00C4,0x00C5,
                           0x00C9,0x00E6,0x00C6,0x00F4,0x00F6,0x00F2,0x00FB,0x00F9,0x00FF,0x00D6,0x00DC,0x00A2,0x00A3,0x00A5,0x20A7,0x0192,
                           0x00E1,0x00ED,0x00F3,0x00FA,0x00F1,0x00D1,0x00AA,0x00BA,0x00BF,0x2310,0x00AC,0x00BD,0x00BC,0x00A1,0x00AB,0x00BB,
                           0x2591,0x2592,0x2593,0x2502,0x2524,0x2561,0x2562,0x2556,0x2555,0x2563,0x2551,0x2557,0x255D,0x255C,0x255B,0x2510,
                           0x2514,0x2534,0x252C,0x251C,0x2500,0x253C,0x255E,0x255F,0x255A,0x2554,0x2569,0x2566,0x2560,0x2550,0x256C,0x2567,
                           0x2568,0x2564,0x2565,0x2559,0x2558,0x2552,0x2553,0x256B,0x256A,0x2518,0x250C,0x2588,0x2584,0x258C,0x2590,0x2580,
                           0x03B1,0x00DF,0x0393,0x03C0,0x03A3,0x03C3,0x00B5,0x03C4,0x03A6,0x0398,0x03A9,0x03B4,0x221E,0x03C6,0x03B5,0x2229,
                           0x2261,0x00B1,0x2265,0x2264,0x2320,0x2321,0x00F7,0x2248,0x00B0,0x2219,0x00B7,0x221A,0x207F,0x00B2,0x25A0,0x00A0};
  char *filename;
  char *cpname;
  int revflag = 0;
  int status = 0;
  FILE *fd;

  /* the program takes two parameters, optionally preceded by -r */
  if (argc == 3) {
    cpname = argv[1];
    filename = argv[2];
  } else if (argc == 4) {
    if (strcmp(argv[1], "-r") == 0) {
      revflag = 1;
    } else {
      puts("ERROR: Unknown option.");
      return(3);
    }
    cpname = argv[2];
    filename = argv[3];
  } else {
    about();
    return(0);
  }

  /* turn the CP437 table into the table of the requested codepage */
  if (loadlookuptable(cpname, lookuptable) != 0) {
    puts("ERROR: Unknown target encoding.");
    return(1);
  }

  /* open the input file */
  fd = fopen(filename, "rb");
  if (fd == NULL) {
    puts("ERROR: Failed to open input file.");
    return(2);
  }

  /* Start converting the file */
  if (revflag == 0) { /* normal operation: utf8 -> codepage */
    long unicodeToken;
    for (;;) {
      unicodeToken = getNextUnicodeTokenFromFile(fd);
      if (unicodeToken == EOF) break;
      putchar(codepagelookup(unicodeToken, lookuptable));
    }
  } else { /* reverse operation: codepage -> utf8 */
    int bytebuff;
    for (;;) {
      bytebuff = getc(fd);
      if (bytebuff == EOF) break;
      outputUnicodeToken(unicodelookup(bytebuff, lookuptable));
    }
  }

  /* Both loops stop on EOF, which is also what a read error looks like, and
   * the results of putchar() are not inspected: check the stream error flags
   * so that an incomplete conversion is not reported as a success. The
   * message goes to stderr because stdout carries the converted text. */
  if (ferror(fd) || (fflush(stdout) == EOF) || ferror(stdout)) {
    fputs("ERROR: I/O failure during conversion.\n", stderr);
    status = 4;
  }

  /* close the input file */
  fclose(fd);
  return(status);
}