Return to Snippet

Revision: 17583
at September 10, 2009 11:54 by manatlan


Updated Code
from htmlentitydefs import name2codepoint as n2cp
import re

def substitute_entity(match):
    ent = match.group(3)
    if match.group(1) == "#":
        if match.group(2) == '':
            return unichr(int(ent))
        elif match.group(2) == 'x':
            return unichr(int('0x'+ent, 16))
    else:
        cp = n2cp.get(ent)
        if cp:
            return unichr(cp)
        else:
            return match.group()

def decode_htmlentities(string):
    entity_re = re.compile(r'&(#?)(x?)(\w+);')
    return entity_re.subn(substitute_entity, string)[0]

Revision: 17582
at September 10, 2009 03:11 by manatlan


Initial Code
from htmlentitydefs import name2codepoint as n2cp
import re

def substitute_entity(match):
    ent = match.group(2)
    if match.group(1) == "#":
        return unichr(int(ent))
    else:
        cp = n2cp.get(ent)

        if cp:
            return unichr(cp)
        else:
            return match.group()

def decode_htmlentities(string):
    entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")
    return entity_re.subn(substitute_entity, string)[0]

Initial URL

                                

Initial Description
ex : decode_htmlentities("l'eau")

Initial Title
decode html entities

Initial Tags
html, python, text

Initial Language
Python