import unicodedata, sys # Translation dictionary. Translation entries are added to this # dictionary as needed. CHAR_REPLACEMENT = { # latin-1 characters that don't have a unicode decomposition 0xc6: u"AE", # LATIN CAPITAL LETTER AE 0xd0: u"D", # LATIN CAPITAL LETTER ETH 0xd8: u"OE", # LATIN CAPITAL LETTER O WITH STROKE 0xde: u"Th", # LATIN CAPITAL LETTER THORN 0xdf: u"ss", # LATIN SMALL LETTER SHARP S 0xe6: u"ae", # LATIN SMALL LETTER AE 0xf0: u"d", # LATIN SMALL LETTER ETH 0xf8: u"oe", # LATIN SMALL LETTER O WITH STROKE 0xfe: u"th", # LATIN SMALL LETTER THORN 0x2018: u"'", # LEFT SINGLE QUOTATION MARK 0x2019: u"'", # RIGHT SINGLE QUOTATION MARK 0x201c: u'"', # LEFT DOUBLE QUOTATION MARK 0x201d: u'"', # RIGHT DOUBLE QUOTATION MARK 0x215D: u"5/8", # VULGAR FRACTION FIVE EIGHTHS 0x215A: u"5/6", # VULGAR FRACTION FIVE SIXTHS 0x2158: u"4/5", # VULGAR FRACTION FOUR FIFTHS 0x215B: u"1/8", # VULGAR FRACTION ONE EIGHTH 0x2155: u"1/5", # VULGAR FRACTION ONE FIFTH 0x00BD: u"1/2", # VULGAR FRACTION ONE HALF 0x00BC: u"1/4", # VULGAR FRACTION ONE QUARTER 0x2159: u"1/6", # VULGAR FRACTION ONE SIXTH 0x2153: u"1/3", # VULGAR FRACTION ONE THIRD 0x215E: u"7/8", # VULGAR FRACTION SEVEN EIGHTHS 0x215C: u"3/8", # VULGAR FRACTION THREE EIGHTHS 0x2157: u"3/5", # VULGAR FRACTION THREE FIFTHS 0x00BE: u"3/4", # VULGAR FRACTION THREE QUARTERS 0x2156: u"2/5", # VULGAR FRACTION TWO FIFTHS 0x2154: u"2/3", # VULGAR FRACTION TWO THIRDS } class unaccented_map(dict): """ Maps a unicode character code (the key) to a replacement code (either a character code or a unicode string). """ def mapchar(self, key): ch = self.get(key) if ch is not None: return ch de = unicodedata.decomposition(unichr(key)) if key not in CHAR_REPLACEMENT and de: try: ch = int(de.split(None, 1)[0], 16) except (IndexError, ValueError): ch = key else: ch = CHAR_REPLACEMENT.get(key, key) self[key] = ch return ch if sys.version >= "2.5": # use __missing__ where available __missing__ = mapchar else: # otherwise, use standard __getitem__ hook (this is slower, # since it's called for each character) __getitem__ = mapchar def unicode_to_ascii(unicodestring): """ Convert a unicode string into an ASCII representation, converting non-ascii characters into close approximations where possible. Special thanks to http://effbot.org/zone/unicode-convert.htm @param Unicode String unicodestring The string to translate @result String """ charmap = unaccented_map() return unicodestring.translate(charmap).encode("ascii", "ignore")