Converting HTML Entities to Characters with Javascript
Article posted by Joel Moss on 29 Jun 2010   |  

So I use Ajax quite a bit in many of my projects. And why not? It kicks ass! I have one project which dynamically updates specific elements of an HTML page via Ajax. Something like this:

// Fetch the Paris travel guide and insert the raw response into #advice as plain text.
$.get('/travel_guide/Paris', function (data) {
  var advice = $('#advice');
  advice.text(data);
});

The problem I have is that sometimes that data can include HTML entities like &amp; and &gt;, and for reasons I don't want to bore you with, I cannot do anything about it on the server side. When inserting a block of text that contains HTML entities into the DOM, those entities are not parsed as such, and get printed exactly as they are.

What I need is a Javascript function that can take a string, parse it for any HTML entities, and replace each one with its corresponding character. And this is what I came up with...

/**
 * Replaces HTML entity references in a string with the characters they stand for.
 *
 * Named entities are looked up in the global `entity_table` (character code ->
 * entity string). Numeric references are decoded directly, both decimal
 * (`&#169;`) and hexadecimal (`&#xA9;`).
 *
 * The ampersand entity (table key 38) is decoded last so that text such as
 * `&amp;lt;` yields the literal text `&lt;` rather than being decoded twice.
 *
 * @param {string} string - Text that may contain HTML entity references.
 * @returns {string} The text with recognised entity references replaced.
 */
function entityToHtml(string) {
	// Table values are used as regex source, so escape metacharacters to match them literally.
	function escapeForRegExp(text) {
		return String(text).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
	}

	// Turn a code point into a string, building a surrogate pair above the BMP.
	function fromCodePoint(codePoint) {
		if (codePoint <= 0xFFFF) {
			return String.fromCharCode(codePoint);
		}
		var offset = codePoint - 0x10000;
		return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
	}

	for (var code in entity_table) {
		// Skip inherited keys, and leave the ampersand (38) for the final pass.
		if (!Object.prototype.hasOwnProperty.call(entity_table, code) || Number(code) === 38) {
			continue;
		}
		string = string.replace(
			new RegExp(escapeForRegExp(entity_table[code]), "g"),
			String.fromCharCode(Number(code))
		);
	}

	// Numeric references: the replacer receives the hex or the decimal digits, never both.
	string = string.replace(/&#(?:[xX]([0-9a-fA-F]+)|(\d+));/g, function (match, hexDigits, decDigits) {
		var codePoint = hexDigits ? parseInt(hexDigits, 16) : parseInt(decDigits, 10);
		// Leave references outside the Unicode range untouched rather than emit garbage.
		return codePoint > 0x10FFFF ? match : fromCodePoint(codePoint);
	});

	var ampersandEntity = entity_table[38];
	if (ampersandEntity) {
		string = string.replace(new RegExp(escapeForRegExp(ampersandEntity), "g"), String.fromCharCode(38));
	}
	return string;
}

/**
 * Lookup table used by entityToHtml: character code -> HTML entity reference.
 *
 * Each value is the entity text to search for; the key is the character code
 * passed to String.fromCharCode() to produce the replacement.
 *
 * NOTE(review): the values are the standard HTML named entities for each code.
 * The names for 264, 265 and 372-375, and the numeric forms used for 8729,
 * 9642, 9643 and 9702, are a best reconstruction - confirm against the
 * author's original table.
 */
var entity_table = {
  //	34: "&quot;",		// Quotation mark. Not required
  38: "&amp;",		// Ampersand. Skipped in the main loop of entityToHtml and decoded last
  60: "&lt;",		// Less-than sign
  62: "&gt;",		// Greater-than sign
  //	63: "?",		// Question mark
  //	111: "o",		// Latin small letter o
  160: "&nbsp;",		// Non-breaking space
  161: "&iexcl;",		// Inverted exclamation mark
  162: "&cent;",		// Cent sign
  163: "&pound;",		// Pound sign
  164: "&curren;",	// Currency sign
  165: "&yen;",		// Yen sign
  166: "&brvbar;",	// Broken vertical bar
  167: "&sect;",		// Section sign
  168: "&uml;",		// Diaeresis
  169: "&copy;",		// Copyright sign
  170: "&ordf;",		// Feminine ordinal indicator
  171: "&laquo;",		// Left-pointing double angle quotation mark
  172: "&not;",		// Not sign
  173: "&shy;",		// Soft hyphen
  174: "&reg;",		// Registered sign
  175: "&macr;",		// Macron
  176: "&deg;",		// Degree sign
  177: "&plusmn;",	// Plus-minus sign
  178: "&sup2;",		// Superscript two
  179: "&sup3;",		// Superscript three
  180: "&acute;",		// Acute accent
  181: "&micro;",		// Micro sign
  182: "&para;",		// Pilcrow sign
  183: "&middot;",	// Middle dot
  184: "&cedil;",		// Cedilla
  185: "&sup1;",		// Superscript one
  186: "&ordm;",		// Masculine ordinal indicator
  187: "&raquo;",		// Right-pointing double angle quotation mark
  188: "&frac14;",	// Vulgar fraction one-quarter
  189: "&frac12;",	// Vulgar fraction one-half
  190: "&frac34;",	// Vulgar fraction three-quarters
  191: "&iquest;",	// Inverted question mark
  192: "&Agrave;",	// A with grave
  193: "&Aacute;",	// A with acute
  194: "&Acirc;",		// A with circumflex
  195: "&Atilde;",	// A with tilde
  196: "&Auml;",		// A with diaeresis
  197: "&Aring;",		// A with ring above
  198: "&AElig;",		// AE
  199: "&Ccedil;",	// C with cedilla
  200: "&Egrave;",	// E with grave
  201: "&Eacute;",	// E with acute
  202: "&Ecirc;",		// E with circumflex
  203: "&Euml;",		// E with diaeresis
  204: "&Igrave;",	// I with grave
  205: "&Iacute;",	// I with acute
  206: "&Icirc;",		// I with circumflex
  207: "&Iuml;",		// I with diaeresis
  208: "&ETH;",		// Eth
  209: "&Ntilde;",	// N with tilde
  210: "&Ograve;",	// O with grave
  211: "&Oacute;",	// O with acute
  212: "&Ocirc;",		// O with circumflex
  213: "&Otilde;",	// O with tilde
  214: "&Ouml;",		// O with diaeresis
  215: "&times;",		// Multiplication sign
  216: "&Oslash;",	// O with stroke
  217: "&Ugrave;",	// U with grave
  218: "&Uacute;",	// U with acute
  219: "&Ucirc;",		// U with circumflex
  220: "&Uuml;",		// U with diaeresis
  221: "&Yacute;",	// Y with acute
  222: "&THORN;",		// Thorn
  223: "&szlig;",		// Sharp s. Also known as ess-zed
  224: "&agrave;",	// a with grave
  225: "&aacute;",	// a with acute
  226: "&acirc;",		// a with circumflex
  227: "&atilde;",	// a with tilde
  228: "&auml;",		// a with diaeresis
  229: "&aring;",		// a with ring above
  230: "&aelig;",		// ae. Also known as ligature ae
  231: "&ccedil;",	// c with cedilla
  232: "&egrave;",	// e with grave
  233: "&eacute;",	// e with acute
  234: "&ecirc;",		// e with circumflex
  235: "&euml;",		// e with diaeresis
  236: "&igrave;",	// i with grave
  237: "&iacute;",	// i with acute
  238: "&icirc;",		// i with circumflex
  239: "&iuml;",		// i with diaeresis
  240: "&eth;",		// eth
  241: "&ntilde;",	// n with tilde
  242: "&ograve;",	// o with grave
  243: "&oacute;",	// o with acute
  244: "&ocirc;",		// o with circumflex
  245: "&otilde;",	// o with tilde
  246: "&ouml;",		// o with diaeresis
  247: "&divide;",	// Division sign
  248: "&oslash;",	// o with stroke. Also known as o with slash
  249: "&ugrave;",	// u with grave
  250: "&uacute;",	// u with acute
  251: "&ucirc;",		// u with circumflex
  252: "&uuml;",		// u with diaeresis
  253: "&yacute;",	// y with acute
  254: "&thorn;",		// thorn
  255: "&yuml;",		// y with diaeresis
  264: "&Ccirc;",		// Latin capital letter C with circumflex
  265: "&ccirc;",		// Latin small letter c with circumflex
  338: "&OElig;",		// Latin capital ligature OE
  339: "&oelig;",		// Latin small ligature oe
  352: "&Scaron;",	// Latin capital letter S with caron
  353: "&scaron;",	// Latin small letter s with caron
  372: "&Wcirc;",		// Latin capital letter W with circumflex
  373: "&wcirc;",		// Latin small letter w with circumflex
  374: "&Ycirc;",		// Latin capital letter Y with circumflex
  375: "&ycirc;",		// Latin small letter y with circumflex
  376: "&Yuml;",		// Latin capital letter Y with diaeresis
  402: "&fnof;",		// Latin small f with hook, function, florin
  710: "&circ;",		// Modifier letter circumflex accent
  732: "&tilde;",		// Small tilde
  913: "&Alpha;",		// Alpha
  914: "&Beta;",		// Beta
  915: "&Gamma;",		// Gamma
  916: "&Delta;",		// Delta
  917: "&Epsilon;",	// Epsilon
  918: "&Zeta;",		// Zeta
  919: "&Eta;",		// Eta
  920: "&Theta;",		// Theta
  921: "&Iota;",		// Iota
  922: "&Kappa;",		// Kappa
  923: "&Lambda;",	// Lambda
  924: "&Mu;",		// Mu
  925: "&Nu;",		// Nu
  926: "&Xi;",		// Xi
  927: "&Omicron;",	// Omicron
  928: "&Pi;",		// Pi
  929: "&Rho;",		// Rho
  931: "&Sigma;",		// Sigma
  932: "&Tau;",		// Tau
  933: "&Upsilon;",	// Upsilon
  934: "&Phi;",		// Phi
  935: "&Chi;",		// Chi
  936: "&Psi;",		// Psi
  937: "&Omega;",		// Omega
  945: "&alpha;",		// alpha
  946: "&beta;",		// beta
  947: "&gamma;",		// gamma
  948: "&delta;",		// delta
  949: "&epsilon;",	// epsilon
  950: "&zeta;",		// zeta
  951: "&eta;",		// eta
  952: "&theta;",		// theta
  953: "&iota;",		// iota
  954: "&kappa;",		// kappa
  955: "&lambda;",	// lambda
  956: "&mu;",		// mu
  957: "&nu;",		// nu
  958: "&xi;",		// xi
  959: "&omicron;",	// omicron
  960: "&pi;",		// pi
  961: "&rho;",		// rho
  962: "&sigmaf;",	// sigmaf
  963: "&sigma;",		// sigma
  964: "&tau;",		// tau
  965: "&upsilon;",	// upsilon
  966: "&phi;",		// phi
  967: "&chi;",		// chi
  968: "&psi;",		// psi
  969: "&omega;",		// omega
  977: "&thetasym;",	// Theta symbol
  978: "&upsih;",		// Greek upsilon with hook symbol
  982: "&piv;",		// Pi symbol
  8194: "&ensp;",		// En space
  8195: "&emsp;",		// Em space
  8201: "&thinsp;",	// Thin space
  8204: "&zwnj;",		// Zero width non-joiner
  8205: "&zwj;",		// Zero width joiner
  8206: "&lrm;",		// Left-to-right mark
  8207: "&rlm;",		// Right-to-left mark
  8211: "&ndash;",	// En dash
  8212: "&mdash;",	// Em dash
  8216: "&lsquo;",	// Left single quotation mark
  8217: "&rsquo;",	// Right single quotation mark
  8218: "&sbquo;",	// Single low-9 quotation mark
  8220: "&ldquo;",	// Left double quotation mark
  8221: "&rdquo;",	// Right double quotation mark
  8222: "&bdquo;",	// Double low-9 quotation mark
  8224: "&dagger;",	// Dagger
  8225: "&Dagger;",	// Double dagger
  8226: "&bull;",		// Bullet
  8230: "&hellip;",	// Horizontal ellipsis
  8240: "&permil;",	// Per mille sign
  8242: "&prime;",	// Prime
  8243: "&Prime;",	// Double Prime
  8249: "&lsaquo;",	// Single left-pointing angle quotation
  8250: "&rsaquo;",	// Single right-pointing angle quotation
  8254: "&oline;",	// Overline
  8260: "&frasl;",	// Fraction Slash
  8364: "&euro;",		// Euro sign
  8472: "&weierp;",	// Script capital P
  8465: "&image;",	// Blackletter capital I
  8476: "&real;",		// Blackletter capital R
  8482: "&trade;",	// Trade mark sign
  8501: "&alefsym;",	// Alef symbol
  8592: "&larr;",		// Leftward arrow
  8593: "&uarr;",		// Upward arrow
  8594: "&rarr;",		// Rightward arrow
  8595: "&darr;",		// Downward arrow
  8596: "&harr;",		// Left right arrow
  8629: "&crarr;",	// Downward arrow with corner leftward. Also known as carriage return
  8656: "&lArr;",		// Leftward double arrow. ISO 10646 does not say that lArr is the same as the 'is implied by' arrow but also does not have any other character for that function, so lArr can be used for 'is implied by' as ISOtech suggests
  8657: "&uArr;",		// Upward double arrow
  8658: "&rArr;",		// Rightward double arrow. ISO 10646 does not say this is the 'implies' character but does not have another character with this function, so rArr can be used for 'implies' as ISOtech suggests
  8659: "&dArr;",		// Downward double arrow
  8660: "&hArr;",		// Left-right double arrow
  // Mathematical Operators
  8704: "&forall;",	// For all
  8706: "&part;",		// Partial differential
  8707: "&exist;",	// There exists
  8709: "&empty;",	// Empty set. Also known as null set and diameter
  8711: "&nabla;",	// Nabla. Also known as backward difference
  8712: "&isin;",		// Element of
  8713: "&notin;",	// Not an element of
  8715: "&ni;",		// Contains as member
  8719: "&prod;",		// N-ary product. Also known as product sign. Prod is not the same character as U+03A0 'greek capital letter pi' though the same glyph might be used for both
  8721: "&sum;",		// N-ary summation. Sum is not the same character as U+03A3 'greek capital letter sigma' though the same glyph might be used for both
  8722: "&minus;",	// Minus sign
  8727: "&lowast;",	// Asterisk operator
  8729: "&#8729;",	// Bullet operator
  8730: "&radic;",	// Square root. Also known as radical sign
  8733: "&prop;",		// Proportional to
  8734: "&infin;",	// Infinity
  8736: "&ang;",		// Angle
  8743: "&and;",		// Logical and. Also known as wedge
  8744: "&or;",		// Logical or. Also known as vee
  8745: "&cap;",		// Intersection. Also known as cap
  8746: "&cup;",		// Union. Also known as cup
  8747: "&int;",		// Integral
  8756: "&there4;",	// Therefore
  8764: "&sim;",		// tilde operator. Also known as varies with and similar to. The tilde operator is not the same character as the tilde, U+007E, although the same glyph might be used to represent both
  8773: "&cong;",		// Approximately equal to
  8776: "&asymp;",	// Almost equal to. Also known as asymptotic to
  8800: "&ne;",		// Not equal to
  8801: "&equiv;",	// Identical to
  8804: "&le;",		// Less-than or equal to
  8805: "&ge;",		// Greater-than or equal to
  8834: "&sub;",		// Subset of
  8835: "&sup;",		// Superset of. Note that nsup, 'not a superset of, U+2283' is not covered by the Symbol font encoding and is not included.
  8836: "&nsub;",		// Not a subset of
  8838: "&sube;",		// Subset of or equal to
  8839: "&supe;",		// Superset of or equal to
  8853: "&oplus;",	// Circled plus. Also known as direct sum
  8855: "&otimes;",	// Circled times. Also known as vector product
  8869: "&perp;",		// Up tack. Also known as orthogonal to and perpendicular
  8901: "&sdot;",		// Dot operator. The dot operator is not the same character as U+00B7 middle dot
  // Miscellaneous Technical
  8968: "&lceil;",	// Left ceiling. Also known as an APL upstile
  8969: "&rceil;",	// Right ceiling
  8970: "&lfloor;",	// left floor. Also known as APL downstile
  8971: "&rfloor;",	// Right floor
  9001: "&lang;",		// Left-pointing angle bracket. Also known as bra. Lang is not the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation mark'
  9002: "&rang;",		// Right-pointing angle bracket. Also known as ket. Rang is not the same character as U+003E 'greater than' or U+203A 'single right-pointing angle quotation mark'
  // Geometric Shapes
  9642: "&#9642;",	// Black small square
  9643: "&#9643;",	// White small square
  9674: "&loz;",		// Lozenge
  // Miscellaneous Symbols
  9702: "&#9702;",	// White bullet
  9824: "&spades;",	// Black (filled) spade suit
  9827: "&clubs;",	// Black (filled) club suit. Also known as shamrock
  9829: "&hearts;",	// Black (filled) heart suit. Also known as valentine
  9830: "&diams;"   // Black (filled) diamond suit
};

Unfortunately, there is no way to convert these entities without a big long list to look through. But it works nicely like so:

// Pass in the text containing entities; the converted string is the return value.
entityToHtml(string_with_entities);

Which makes my code now look like this:

// Fetch the Paris travel guide, convert its entities, and insert the result into #advice as plain text.
$.get('/travel_guide/Paris', function (data) {
  var advice = $('#advice');
  advice.text(entityToHtml(data));
});

This site contains the musings of Joel Moss, and is powered by Codaset pages; a simple, yet powerful way to host your static site. Just commit and push your site to your free Git repository at Codaset, and that's it!

Tell me more about Joel »