Daily Archives: October 3, 2009

JavaScript snippet to convert raw UTF-8 to Unicode

From the I-don’t-have-a-sane-use-for-this department comes this piece of code, which takes a string of raw UTF-8 bytes (one byte per character), decodes it, and passes each decoded code point through String.fromCharCode so that it renders correctly in a Unicode-capable browser. A possible use would be if the web page character set is not UTF-8 and you want to display UTF-8 text. To use it, just put it in a script tag and call utf8decode(myrawutf8string). But seriously, all web pages should be UTF-8 by default nowadays. Here it is, in case anyone wants it:

[js]
/**
 * Decodes the single UTF-8 sequence that starts at index `i` of `b`.
 *
 * Results are returned through two globals, not through the return value:
 *   window.utf8_out_intc - the decoded code point
 *   window.utf8_out_i    - index of the LAST byte consumed by the sequence
 * Both globals are left untouched when the function returns false.
 *
 * @param {*}      c     Unused; kept so existing callers keep working.
 * @param {number} intc  Incoming value is ignored; used only as a scratch variable.
 * @param {string} b     String whose char codes are the raw UTF-8 bytes.
 * @param {number} i     Index in `b` of the lead byte to decode.
 * @param {number} count Number of bytes available in `b` (normally b.length).
 * @returns {boolean} true when a sequence was decoded; false when the lead
 *                    byte is not a valid 1-4 byte lead or the sequence runs
 *                    past `count`.
 */
function TryGetCharUTF8(c, intc, b, i, count)
{
    /*
     * Lead-byte bit patterns (binary, hex):
     *   10000000 80   high bit clear => 1-byte (ASCII)
     *   11000000 C0   2-byte lead
     *   11100000 E0   3-byte lead
     *   11110000 F0   4-byte lead
     *   11111000 F8   not decoded here (returns false)
     *   11111100 FC   not decoded here (returns false)
     *
     * FEFF = 65279 = BOM
     *
     * Example outside the BMP: musical bass clef, 119070 = 0x1D11E,
     * which is the UTF-16 pair 0xD834 0xDD1E.
     */
    var lead = b.charCodeAt(i);

    if ((lead & 0x80) === 0)
    {
        // 1 byte: the value is the code point itself.
        intc = lead;
    }
    else if ((lead & 0xE0) === 0xC0)
    {
        // 2 bytes: 5 payload bits + 6 payload bits.
        if (i + 1 >= count) return false;
        intc = ((lead & 0x1F) << 6) | (b.charCodeAt(i + 1) & 0x3F);
        i += 1;
    }
    else if ((lead & 0xF0) === 0xE0)
    {
        // 3 bytes: covers the rest of the BMP.
        if (i + 2 >= count) return false;
        intc = ((lead & 0xF) << 12) | ((b.charCodeAt(i + 1) & 0x3F) << 6) | (b.charCodeAt(i + 2) & 0x3F);
        i += 2;
    }
    else if ((lead & 0xF8) === 0xF0)
    {
        // 4 bytes: code points above the BMP.
        if (i + 3 >= count) return false;
        intc = ((lead & 0x7) << 18) | ((b.charCodeAt(i + 1) & 0x3F) << 12) | ((b.charCodeAt(i + 2) & 0x3F) << 6) | (b.charCodeAt(i + 3) & 0x3F);
        // Step over all three continuation bytes so none is re-read as a lead byte.
        i += 3;
    }
    else
    {
        // Stray continuation byte or an unsupported 5/6-byte lead.
        return false;
    }

    // Publish the results; the caller resumes scanning at utf8_out_i + 1.
    window.utf8_out_intc = intc;
    window.utf8_out_i = i;
    return true;
}

/**
 * Decodes a string of raw UTF-8 bytes (one byte per character) into a
 * regular JavaScript string.
 *
 * Each sequence is decoded by TryGetCharUTF8, which reports the code point in
 * window.utf8_out_intc and moves window.utf8_out_i to the last byte it
 * consumed; the loop increment then steps to the next lead byte.
 *
 * @param {string} s String whose char codes are the raw UTF-8 bytes.
 * @returns {string} The decoded text. Bytes that cannot be decoded, and code
 *                   points above U+10FFFF, become U+FFFD.
 */
function utf8decode(s) {
    var ss = "";
    // window.utf8_out_i is the shared cursor: written here and by TryGetCharUTF8.
    for (window.utf8_out_i = 0; window.utf8_out_i < s.length; window.utf8_out_i++) {
        var ok = TryGetCharUTF8(window.utf8_out_c, window.utf8_out_intc, s, window.utf8_out_i, s.length);
        if (!ok) {
            // Nothing was decoded, so utf8_out_intc is stale; emit a replacement
            // character and let the loop skip this one byte.
            ss += "\uFFFD";
            continue;
        }

        var codePoint = window.utf8_out_intc;
        if (codePoint > 0x10FFFF) {
            // Beyond the Unicode range; cannot be represented in UTF-16.
            ss += "\uFFFD";
        } else if (codePoint > 0xFFFF) {
            // String.fromCharCode only takes 16-bit units, so split the code
            // point into a high/low surrogate pair.
            var offset = codePoint - 0x10000;
            ss += String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
        } else {
            ss += String.fromCharCode(codePoint);
        }
    }
    return ss;
}
[/js]