import unicodedata, sys
# Translation dictionary, keyed by unicode code point. Each value is the
# ASCII text substituted for that character. Entries listed here take
# precedence over the generic unicodedata decomposition lookup, so this is
# the place for characters that have no decomposition, or whose
# decomposition does not start with a usable base character (for example
# the "<fraction> ..." compatibility decompositions).
# Translation entries are added to this dictionary as needed.
CHAR_REPLACEMENT = {
    # latin-1 letters that don't have a unicode decomposition
    0xc6: u"AE",  # LATIN CAPITAL LETTER AE
    0xd0: u"D",  # LATIN CAPITAL LETTER ETH
    0xd8: u"OE",  # LATIN CAPITAL LETTER O WITH STROKE
    0xde: u"Th",  # LATIN CAPITAL LETTER THORN
    0xdf: u"ss",  # LATIN SMALL LETTER SHARP S
    0xe6: u"ae",  # LATIN SMALL LETTER AE
    0xf0: u"d",  # LATIN SMALL LETTER ETH
    0xf8: u"oe",  # LATIN SMALL LETTER O WITH STROKE
    0xfe: u"th",  # LATIN SMALL LETTER THORN

    # typographic quotation marks
    0x2018: u"'",  # LEFT SINGLE QUOTATION MARK
    0x2019: u"'",  # RIGHT SINGLE QUOTATION MARK
    0x201c: u'"',  # LEFT DOUBLE QUOTATION MARK
    0x201d: u'"',  # RIGHT DOUBLE QUOTATION MARK

    # vulgar fractions
    0x215D: u"5/8",  # VULGAR FRACTION FIVE EIGHTHS
    0x215A: u"5/6",  # VULGAR FRACTION FIVE SIXTHS
    0x2158: u"4/5",  # VULGAR FRACTION FOUR FIFTHS
    0x215B: u"1/8",  # VULGAR FRACTION ONE EIGHTH
    0x2155: u"1/5",  # VULGAR FRACTION ONE FIFTH
    0x00BD: u"1/2",  # VULGAR FRACTION ONE HALF
    0x00BC: u"1/4",  # VULGAR FRACTION ONE QUARTER
    0x2159: u"1/6",  # VULGAR FRACTION ONE SIXTH
    0x2153: u"1/3",  # VULGAR FRACTION ONE THIRD
    0x215E: u"7/8",  # VULGAR FRACTION SEVEN EIGHTHS
    0x215C: u"3/8",  # VULGAR FRACTION THREE EIGHTHS
    0x2157: u"3/5",  # VULGAR FRACTION THREE FIFTHS
    0x00BE: u"3/4",  # VULGAR FRACTION THREE QUARTERS
    0x2156: u"2/5",  # VULGAR FRACTION TWO FIFTHS
    0x2154: u"2/3",  # VULGAR FRACTION TWO THIRDS
    # The remaining vulgar fractions, previously missing from this table
    # and therefore dropped instead of transliterated.
    0x2150: u"1/7",  # VULGAR FRACTION ONE SEVENTH
    0x2151: u"1/9",  # VULGAR FRACTION ONE NINTH
    0x2152: u"1/10",  # VULGAR FRACTION ONE TENTH
    0x2189: u"0/3",  # VULGAR FRACTION ZERO THIRDS
}
class unaccented_map(dict):
    """
    Maps a unicode character code (the key) to a replacement code
    (either a character code or a unicode string).

    The table starts out empty and fills itself on demand: the first
    lookup of a code point computes its replacement with mapchar() and
    stores it, so later lookups of the same code point are plain dict
    hits. Instances are meant to be passed to the string translate()
    method.
    """

    # unichr() only exists on Python 2; on Python 3 chr() accepts any code
    # point. Wrapped in staticmethod so the builtin is never bound to self.
    try:
        _to_char = staticmethod(unichr)
    except NameError:
        _to_char = staticmethod(chr)

    def mapchar(self, key):
        """
        Return the replacement for code point ``key``, caching the result.

        Lookup order:
          1. a value already stored in this mapping;
          2. an explicit entry in CHAR_REPLACEMENT;
          3. the base (first) code point of the character's unicode
             decomposition, itself resolved through this method so that
             multi-level decompositions and bases listed in
             CHAR_REPLACEMENT end up fully translated;
          4. ``key`` itself when none of the above applies.

        @param int key Unicode code point to translate
        @result int or unicode string
        """
        ch = self.get(key)
        if ch is not None:
            return ch
        if key in CHAR_REPLACEMENT:
            # Explicit entries win over any decomposition data.
            ch = CHAR_REPLACEMENT[key]
        else:
            ch = key
            de = unicodedata.decomposition(self._to_char(key))
            if de:
                try:
                    base = int(de.split(None, 1)[0], 16)
                except (IndexError, ValueError):
                    # The first field is a tag such as "<compat>" rather
                    # than a hex code point: leave the character as is.
                    base = key
                if base != key:
                    # The base may itself be accented (U+1EA5 -> U+00E2)
                    # or listed in CHAR_REPLACEMENT (U+01FC -> U+00C6), so
                    # resolve it too instead of returning it verbatim.
                    ch = self.mapchar(base)
        self[key] = ch
        return ch

    # Compare version tuples, not strings: string comparison is
    # lexicographic and would, for instance, order "2.10" before "2.5".
    if sys.version_info >= (2, 5):
        # use __missing__ where available
        __missing__ = mapchar
    else:
        # otherwise, use standard __getitem__ hook (this is slower,
        # since it's called for each character)
        __getitem__ = mapchar
def unicode_to_ascii(unicodestring):
    """
    Produce an ASCII rendering of a unicode string.

    Every character is first run through a freshly created unaccented_map
    translation table; the translated text is then encoded as ASCII with
    the "ignore" error handler, so anything that is still non-ASCII after
    translation is dropped from the output.

    Special thanks to http://effbot.org/zone/unicode-convert.htm

    @param Unicode String unicodestring The string to translate
    @result String
    """
    translated = unicodestring.translate(unaccented_map())
    return translated.encode("ascii", "ignore")
Comments
You can also use the snippet I posted here:
http://www.djangosnippets.org/snippets/556/
to convert all non-ASCII characters to their equivalent HTML entities, which solves this issue more elegantly.
#
Converting to HTML entities is fine if you are sending the resulting text to something that will decode it for the user. If you are passing raw text, seeing &ldquo;Hello&rdquo; looks a bit odd to people compared to: "Hello".
#
Thanks so much for this snippet! I had hacked together a crude string-replacement script to achieve this, but your solution is much more elegant. Just two things I'd add:
At the beginning of the unicode_to_ascii function, I added a 'unicodestring = unicode(unicodestring)' to also catch regular strings that might have unicode characters.
I also added some entries to the translation dict to account for Portuguese accented characters, as well as the cedilla (ç):
0xe0: u'a', 0xe1: u'a', 0xe3: u'a', 0xe8: u'e', 0xe9: u'e', 0xea: u'e', 0xec: u'i', 0xed: u'i', 0xf3: u'o', 0xf2: u'o', 0xf5: u'o', 0xfa: u'u', 0xf9: u'u', 0xe7: u'c',
Thanks again!
#