Tag Archives: utf-8

JavaScript snippet to convert raw UTF-8 to Unicode

From the I-don’t-know-a-sane-use-for-this department comes this piece of code, which takes a string of raw UTF-8 bytes (one byte per character), decodes it, and passes each decoded code point to String.fromCharCode so that it renders correctly in a Unicode-capable browser. A possible use would be if the web page character set is not UTF-8 and you want to display UTF-8 text. To use it, just put it in a script tag and call utf8decode(myrawutf8string). But seriously, all web pages should be UTF-8 by default nowadays. Here it is, in case anyone wants it:

/**
 * Decodes a single UTF-8 sequence that starts at index `i` of `b`.
 *
 * `b` is a "byte string": each character's code is treated as one raw byte.
 * The lead byte selects the sequence length:
 *
 *   0xxxxxxx  (b & 0x80) == 0x00   1 byte  (ASCII)
 *   110xxxxx  (b & 0xE0) == 0xC0   2 bytes
 *   1110xxxx  (b & 0xF0) == 0xE0   3 bytes (rest of the BMP, e.g. U+FEFF BOM)
 *   11110xxx  (b & 0xF8) == 0xF0   4 bytes (e.g. U+1D11E, which is the
 *                                  UTF-16 surrogate pair D834 DD1E)
 *
 * On success the results are published through two globals (the original
 * "out parameter" convention of this snippet):
 *   window.utf8_out_intc - the decoded code point
 *   window.utf8_out_i    - index of the LAST byte consumed, so a caller's
 *                          `i++` lands on the next sequence
 * On failure neither global is modified.
 *
 * Overlong encodings and encoded surrogates are not rejected.
 *
 * @param {*}      c     Unused; kept for call compatibility.
 * @param {number} intc  Unused on input; kept for call compatibility.
 * @param {string} b     Byte string to read from.
 * @param {number} i     Index of the lead byte.
 * @param {number} count Number of valid bytes in `b` (falls back to
 *                       b.length when not a number).
 * @returns {boolean} true if a sequence was decoded; false if the lead byte
 *                    is invalid, the sequence is truncated, or a
 *                    continuation byte is not of the form 10xxxxxx.
 */
function TryGetCharUTF8(c, intc, b, i, count)
{
	var limit = (typeof count === "number") ? count : b.length;
	var lead = b.charCodeAt(i);
	var extra; // number of continuation bytes that follow the lead byte
	var k, cont;

	if ((lead & 0x80) == 0)
	{
		intc = lead;
		extra = 0;
	}
	else if ((lead & 0xE0) == 0xC0)
	{
		intc = lead & 0x1F;
		extra = 1;
	}
	else if ((lead & 0xF0) == 0xE0)
	{
		intc = lead & 0x0F;
		extra = 2;
	}
	else if ((lead & 0xF8) == 0xF0)
	{
		intc = lead & 0x07;
		extra = 3;
	}
	else
	{
		// Stray continuation byte (10xxxxxx) or an invalid lead (11111xxx).
		return false;
	}

	// The whole sequence must fit inside the input; charCodeAt past the end
	// yields NaN, which the bit operations would silently turn into 0.
	if (i + extra >= limit)
		return false;

	for (k = 1; k <= extra; k++)
	{
		cont = b.charCodeAt(i + k);
		if ((cont & 0xC0) != 0x80)
			return false;
		intc = (intc << 6) | (cont & 0x3F);
	}

	window.utf8_out_intc = intc;
	window.utf8_out_i = i + extra;
	return true;
}

/**
 * Decodes a string of raw UTF-8 bytes (one byte per character code) into a
 * regular JavaScript string.
 *
 * Each sequence is decoded by TryGetCharUTF8, which reports its results via
 * window.utf8_out_intc (code point) and window.utf8_out_i (index of the last
 * byte it consumed).
 *
 * - When TryGetCharUTF8 returns false, U+FFFD (the replacement character) is
 *   emitted for that byte and decoding resumes at the next byte, instead of
 *   re-appending whatever code point the previous call left behind.
 * - Code points above U+FFFF are emitted as a UTF-16 surrogate pair, because
 *   String.fromCharCode keeps only the low 16 bits of each argument.
 *
 * @param {string} s Byte string to decode; null/undefined yields "".
 * @returns {string} The decoded text.
 */
function utf8decode(s) {
	var ss = "";
	var pos, cp;

	if (s == null)
		return ss;

	for (pos = 0; pos < s.length; pos++) {
		if (!TryGetCharUTF8(window.utf8_out_c, window.utf8_out_intc, s, pos, s.length)) {
			ss += "\uFFFD";
			continue;
		}

		cp = window.utf8_out_intc;
		// Jump to the last byte consumed; the loop's pos++ moves past it.
		pos = window.utf8_out_i;

		if (cp > 0x10FFFF) {
			// Not representable in UTF-16.
			ss += "\uFFFD";
		} else if (cp > 0xFFFF) {
			cp -= 0x10000;
			ss += String.fromCharCode(0xD800 + (cp >> 10), 0xDC00 + (cp & 0x3FF));
		} else {
			ss += String.fromCharCode(cp);
		}
	}
	return ss;
}