decode html entities - Python Snipplr Social Repository

Revision: 17583

at September 10, 2009 11:54 by manatlan

Updated Code

from htmlentitydefs import name2codepoint as n2cp
import re

def substitute_entity(match):
    """Return the replacement text for one matched HTML entity reference.

    Written as a regex substitution callback. The match is expected to
    expose three groups: group 1 is "#" for a numeric reference (empty
    otherwise), group 2 is an optional leading "x", and group 3 is the
    rest of the reference body.

    Numeric references (decimal, or hexadecimal with an "x"/"X" prefix)
    are converted with unichr(); named references are looked up in n2cp.
    A reference that cannot be decoded -- unknown name, non-numeric
    digits, or a code point unichr() rejects -- is returned unchanged.
    """
    # Re-join groups 2 and 3: the pattern peels a leading "x" into its own
    # group even for named entities such as "&xi;", so group 3 alone would
    # lose the first letter of the name.
    body = match.group(2) + match.group(3)

    if match.group(1) == "#":
        try:
            if body[:1] in ('x', 'X'):
                return unichr(int('0x' + body[1:], 16))
            return unichr(int(body))
        except (ValueError, OverflowError):
            # Malformed digits or an out-of-range code point: leave the
            # original text in place rather than aborting the whole
            # substitution.
            return match.group()

    cp = n2cp.get(body)
    if cp is None:
        return match.group()
    return unichr(cp)

def decode_htmlentities(string):
    """Return *string* with each entity-shaped span passed through
    substitute_entity and replaced by whatever that callback returns."""
    return re.sub(r'&(#?)(x?)(\w+);', substitute_entity, string)

Revision: 17582

at September 10, 2009 03:11 by manatlan

Initial Code

from htmlentitydefs import name2codepoint as n2cp
import re

def substitute_entity(match):
    """Return the replacement text for one matched HTML entity reference.

    Written as a regex substitution callback. Group 1 of the match is "#"
    for a numeric reference (empty otherwise) and group 2 is the
    reference body.

    Numeric references (decimal, or hexadecimal with an "x"/"X" prefix)
    are converted with unichr(); named references are looked up in n2cp.
    A reference that cannot be decoded -- unknown name, non-numeric
    digits, or a code point unichr() rejects -- is returned unchanged.
    """
    ent = match.group(2)

    if match.group(1) == "#":
        try:
            if ent[:1] in ('x', 'X'):
                return unichr(int('0x' + ent[1:], 16))
            return unichr(int(ent))
        except (ValueError, OverflowError):
            # The body is not a valid number or names an out-of-range
            # code point: keep the original text instead of raising out
            # of the substitution.
            return match.group()

    cp = n2cp.get(ent)
    if cp is None:
        return match.group()
    return unichr(cp)

def decode_htmlentities(string):
    """Return *string* with each entity-shaped span passed through
    substitute_entity and replaced by whatever that callback returns."""
    # Raw string: the backslash sequences belong to the regex, not to the
    # Python string literal (the non-raw spelling relies on invalid string
    # escapes being passed through).
    return re.sub(r"&(#?)(\d{1,5}|\w{1,8});", substitute_entity, string)

Initial URL

Initial Description

Example: decode_htmlentities("l&#39;eau") returns u"l'eau"

Initial Title

decode html entities

Initial Tags

html, python, text

Initial Language

Python

Choose a language for easy browsing: