# , , etc.  (garbled remnant of the commented-out MediaWiki PHP block below)
# $text = $this->strip( $text, $this->mStripState );
# if ( $this->ot['html'] ) {
# $text = Sanitizer::removeHTMLtags( $text, array( &$this, 'replaceVariables' ), $assocArgs );
# } elseif ( $this->ot['pre'] && $this->mOptions->getRemoveComments() ) {
# $text = Sanitizer::removeHTMLcomments( $text );
# }
# }
# $text = $this->replaceVariables( $text, $assocArgs );
#
# # If the template begins with a table or block-level
# # element, it should be treated as beginning a new line.
# if (!$piece['lineStart'] && preg_match('/^({\\||:|;|#|\*)/', $text)) /*}*/{
# $text = "\n" . $text;
# }
# } elseif ( !$noargs ) {
# # $noparse and !$noargs
# # Just replace the arguments, not any double-brace items
# # This is used for rendered interwiki transclusion
# $text = $this->replaceVariables( $text, $assocArgs, true );
# }
# }
# # Prune lower levels off the recursion check path
# $this->mTemplatePath = $lastPathLevel;
#
# if ( $found && !$this->incrementIncludeSize( 'post-expand', strlen( $text ) ) ) {
# # Error, oversize inclusion
# $text = $linestart .
# "[[$titleText]]";
# $noparse = true;
# $noargs = true;
# }
#
# if ( !$found ) {
# wfProfileOut( $fname );
# return $piece['text'];
# } else {
# wfProfileIn( __METHOD__ . '-placeholders' );
# if ( $isHTML ) {
# # Replace raw HTML by a placeholder
# # Add a blank line preceding, to prevent it from mucking up
# # immediately preceding headings
# $text = "\n\n" . $this->insertStripItem( $text, $this->mStripState );
# } else {
# # replace ==section headers==
# # XXX this needs to go away once we have a better parser.
# if ( !$this->ot['wiki'] && !$this->ot['pre'] && $replaceHeadings ) {
# if( !is_null( $title ) )
# $encodedname = base64_encode($title->getPrefixedDBkey());
# else
# $encodedname = base64_encode("");
# $m = preg_split('/(^={1,6}.*?={1,6}\s*?$)/m', $text, -1,
# PREG_SPLIT_DELIM_CAPTURE);
# $text = '';
# $nsec = 0;
# for( $i = 0; $i < count($m); $i += 2 ) {
# $text .= $m[$i];
# if (!isset($m[$i + 1]) || $m[$i + 1] == "") continue;
# $hl = $m[$i + 1];
# if( strstr($hl, "" . $m2[3];
#
# $nsec++;
# }
# }
# }
# wfProfileOut( __METHOD__ . '-placeholders' );
# }
#
# # Prune lower levels off the recursion check path
# $this->mTemplatePath = $lastPathLevel;
#
# if ( !$found ) {
# wfProfileOut( $fname );
# return $piece['text'];
# } else {
# wfProfileOut( $fname );
# return $text;
# }
# }
# Precompiled patterns for fixtags(): French typographic spacing.
# \302\273 is the UTF-8 byte pair for the right guillemet, \302\253 for
# the left guillemet.  The left pattern requires a preceding character so
# a line-leading space is not matched.
_guillemetLeftPat = re.compile(ur'(.) (\?|:|;|!|\302\273)', re.UNICODE)
_guillemetRightPat = re.compile(ur'(\302\253) ', re.UNICODE)
def fixtags(text):
    """Clean up special characters, only run once, next-to-last before doBlockLevels"""
    # french spaces, last one Guillemet-left
    # only if there is something before the space
    # NOTE(review): upstream MediaWiki uses '&nbsp;' as the glue in these
    # replacements; the plain space below may be an artifact of the
    # HTML-entity stripping that corrupted this file -- TODO confirm
    # against the original source before relying on the output.
    text = _guillemetLeftPat.sub(ur'\1 \2', text)
    # french spaces, Guillemet-right
    text = _guillemetRightPat.sub(ur'\1 ', text)
    return text
def closeParagraph(mLastSection):
    """Used by doBlockLevels().

    Emit the closing tag for the currently open block-level section
    (e.g. u'p' or u'pre') followed by a newline, or the empty string
    when no section is open.

    Fix: the original line read  u'' + mLastSection + u'>\\n'  -- the
    '</' had been stripped by the HTML-mangling that corrupted this
    file; restored here (matches MediaWiki Parser::closeParagraph()).
    """
    result = u''
    if mLastSection != u'':
        result = u'</' + mLastSection + u'>\n'
    return result
def getCommon(st1, st2):
    """Return the length of the longest common prefix of st1 and st2.

    (I.e. how many characters, starting at index 0, the two strings
    share before they first differ; bounded by the shorter string.)
    """
    limit = min(len(st1), len(st2))
    index = 0
    while index < limit and st1[index] == st2[index]:
        index += 1
    return index
def openList(char, mLastSection):
    """
    These next three functions open, continue, and close the list
    element appropriate to the prefix character passed into them.

    Returns (html, mDTopen): markup that closes any open paragraph and
    opens the new list, plus the new definition-term-open flag (True
    only for ';', which opens a <dt>).

    Fix: the tag literals below had been stripped to bare text by the
    HTML-mangling that corrupted this file (leaving unterminated string
    literals); restored from MediaWiki Parser::openList(), which this
    port follows.
    """
    result = closeParagraph(mLastSection)
    mDTopen = False
    if char == u'*':
        result += u'<ul><li>'
    elif char == u'#':
        result += u'<ol><li>'
    elif char == u':':
        result += u'<dl><dd>'
    elif char == u';':
        result += u'<dl><dt>'
        mDTopen = True
    else:
        # Unknown prefix char.  NOTE(review): upstream emits an
        # '<!-- ERR 1 -->' marker here; the stripped source shows only
        # u'', which is kept -- TODO confirm against the original.
        result += u''
    return result, mDTopen
def nextItem(char, mDTopen):
    """Continue a list: emit markup for the next item of the list type
    selected by char ('*', '#', ':' or ';').

    Returns (html, newDTopen); newDTopen is None when the
    definition-term flag is unchanged, True/False when it changes.

    Fixes: the tag literals had been stripped by HTML-mangling (leaving
    unterminated strings) and are restored from MediaWiki
    Parser::nextItem(); the fall-through return is normalised to a
    2-tuple because every visible caller unpacks two values.
    """
    if char == u'*' or char == '#':
        return u'</li><li>', None
    elif char == u':' or char == u';':
        close = u'</dd>'
        if mDTopen:
            close = '</dt>'
        if char == u';':
            return close + u'<dt>', True
        else:
            return close + u'<dd>', False
    # Unknown prefix char -- empty fragment, flag unchanged.
    return u'', None
def closeList(char, mDTopen):
    """Close the innermost open list of the type selected by char.

    ';' appears never to reach here: the caller closes from `pref2`,
    which looks like a ';'->':' normalised prefix -- TODO confirm (the
    surrounding doBlockLevels() code is corrupted in this file).

    Fix: the tag literals had been stripped to bare newlines by
    HTML-mangling; restored from MediaWiki Parser::closeList().
    """
    if char == u'*':
        return u'</li></ul>\n'
    elif char == u'#':
        return u'</li></ol>\n'
    elif char == u':':
        if mDTopen:
            return u'</dt></dl>\n'
        else:
            return u'</dd></dl>\n'
    else:
        # Unknown char.  NOTE(review): upstream emits '<!-- ERR 3 -->';
        # the stripped source shows u'', which is kept as-is.
        return u''
# ---------------------------------------------------------------------------
# NOTE(review): the span below is a CORRUPTED REMNANT of doBlockLevels()
# (the paragraph/list state machine ported from MediaWiki Parser.php),
# fused with what were the _closePrePat / _openMatchPat / _closeMatchPat
# regex definitions.  The `def doBlockLevels(...)` header, the regex
# pattern bodies, and many interior lines are missing entirely, and the
# surviving string literals have had their HTML tags stripped (e.g.
# L231-style `u'' + mLastSection + u'>'` was presumably `u'</' + ...`).
# The original structure cannot be reconstructed safely from what is
# visible, so the lines are preserved verbatim for comparison against an
# uncorrupted copy -- do not attempt to run this span as-is.
# ---------------------------------------------------------------------------
_closePrePat = re.compile(u" 0:
tmpOutput, tmpMDTopen = nextItem(pref[commonPrefixLength-1])
output += tmpOutput
if tmpMDTopen is not None:
mDTopen = tmpMDTopen
while prefixLength > commonPrefixLength:
char = pref[commonPrefixLength:commonPrefixLength+1]
tmpOutput, tmpMDTOpen = openList(char, mLastSection)
if tmpMDTOpen:
mDTopen = True
output += tmpOutput
mLastSection = u''
mInPre = False
if char == u';':
# FIXME: This is dupe of code above
if findColonNoLinks(t, term, t2) != False:
t = t2
output += term
tmpOutput, tmpMDTopen = nextItem(u':', mDTopen)
output += tmpOutput
if tmpMDTopen is not None:
mDTopen = tmpMDTopen
commonPrefixLength += 1
lastPrefix = pref2
# NOTE(review): from here the no-prefix (paragraph-mode) branch survives
# only partially; _closeMatchPat's pattern body is truncated mid-string.
if prefixLength == 0:
# No prefix (not in list)--go to paragraph mode
# XXX: use a stack for nestable elements like span, table and div
openmatch = _openMatchPat.search(t)
_closeMatchPat = re.compile(ur"('
mInPre = False
mLastSection = u'pre'
t = t[1:]
else:
# paragraph
if t.strip() == u'':
if paragraphStack:
output += paragraphStack + u'
'
paragraphStack = False
mLastSection = u'p'
else:
if mLastSection != u'p':
output += closeParagraph(mLastSection)
mLastSection = u''
mInPre = False
paragraphStack = u''
else:
paragraphStack = u'
'
else:
if paragraphStack:
output += paragraphStack
paragraphStack = False
mLastSection = u'p'
elif mLastSection != u'p':
output += closeParagraph(mLastSection) + u'
'
mLastSection = u'p'
mInPre = False
# somewhere above we forget to get out of pre block (bug 785)
if preCloseMatch and mInPre:
mInPre = False
if paragraphStack == False:
output += t + u"\n"
while prefixLength:
output += closeList(pref2[prefixLength-1], mDTopen)
mDTopen = False
prefixLength -= 1
# NOTE(review): next line appears to have lost a '</' -- presumably
# `output += u'</' + mLastSection + u'>'` in the uncorrupted source.
if mLastSection != u'':
output += u'' + mLastSection + u'>'
mLastSection = u''
return output
def parse(text, showToc=True):
    """Run the full wiki-markup -> HTML pipeline over ``text``.

    Accepts a byte string or unicode and returns the same kind it was
    given (bytes in -> UTF-8 bytes out).

    showToc -- passed through to formatHeadings(); forced off below when
    the TOC check fails.
    """
    # Remember whether the caller passed bytes so we can re-encode on exit.
    utf8 = isinstance(text, str)
    text = to_unicode(text)
    # Ensure a trailing newline (later passes rely on it) and remember
    # whether we added it so it can be stripped again at the end.
    if text[-1:] != u'\n':
        text = text + u'\n'
        taggedNewline = True
    else:
        taggedNewline = False
    mStripState = {}
    # Unique marker prefix protecting stripped regions from later passes;
    # \x07 should never occur in real wikitext.
    mUniqPrefix = u"\x07UNIQ" + unicode(random.randint(1, 1000000000))
    text = strip(text, mStripState, mUniqPrefix)
    text = removeHtmlTags(text)
    text = replaceVariables(text)
    text = doTableStuff(text, mStripState)
    text = parseHorizontalRule(text)
    text, toc = checkTOC(text)
    text = parseHeaders(text)
    text = parseAllQuotes(text)
    text = replaceInternalLinks(text)
    text = replaceExternalLinks(text)
    # NOTE(review): the empty search string below was almost certainly a
    # TOC marker (an HTML comment) stripped by the mangling that corrupted
    # this file; as written, find(u"") is always 0, so this branch can
    # never disable the TOC -- TODO restore the marker from a clean copy.
    if not toc and text.find(u"") == -1:
        showToc = False
    text = formatHeadings(text, True, showToc, mStripState)
    text = unstrip(text, mStripState)
    text = fixtags(text)
    text = doBlockLevels(text, True, mUniqPrefix)
    text = unstripNoWiki(text, mStripState)
    # Drop the newline we added above, if it is still the last character.
    if taggedNewline and text[-1:] == u'\n':
        text = text[:-1]
    if utf8:
        return text.encode("utf-8")
    return text
def truncate_url(url, length=40):
    """Shorten ``url`` for display to approximately ``length`` characters.

    The final path segment ('/segment' or '/segment/') is kept visible
    when possible and an ellipsis marks the removed part.  URLs that are
    already short enough, or that have no trailing path segment, are
    returned unchanged.

    Note: the result may slightly exceed ``length`` (historical behaviour
    preserved -- head and tail are each truncated to length-3 independently).

    Fixes: removed the unused local `domain` (computed via str.replace
    but never read) and the ambiguous single-letter name `l`.
    """
    if len(url) <= length:
        return url
    import re
    # Grab the final path segment, e.g. '/page' or '/page/'.
    match = re.search(r'(/[^/]+/?)$', url)
    if not match:
        return url
    tail = match.group(1)
    head = url[0:len(url) - len(tail)]
    if head == head[0:length - 3]:
        # Head already fits within length-3: trim the tail instead.
        tail = tail[0:length - 3] + '...'
    else:
        head = head[0:length - 3]
        tail = '...' + tail
    return head + tail
def to_unicode(text, charset=None):
    """Coerce ``text`` to an ``unicode`` object (Python 2).

    Byte strings are decoded with ``charset`` in 'replace' mode when one
    is given; otherwise UTF-8 is tried first, falling back to the locale
    preferred encoding in 'replace' mode.  (The builtin ``unicode`` would
    instead use the locale encoding in 'strict' mode and may raise
    UnicodeDecodeError.)  Exceptions are rendered via their __str__ when
    that yields unicode, else by joining their decoded ``args``.  Any
    other object goes straight through the ``unicode`` builtin.

    Because of 'replace' mode the original content may be altered; if
    that matters, decode with a byte-transparent charset yourself, e.g.
    ``unicode(text, 'iso-8859-1')``.
    """
    if isinstance(text, str):
        # Byte string: decode it.
        if charset:
            return unicode(text, charset, 'replace')
        try:
            return unicode(text, 'utf-8')
        except UnicodeError:
            return unicode(text, locale.getpreferredencoding(), 'replace')
    if isinstance(text, Exception):
        # two possibilities for storing unicode strings in exception data:
        try:
            # custom __str__ method on the exception (e.g. PermissionError)
            return unicode(text)
        except UnicodeError:
            # unicode arguments given to the exception (e.g. parse_date)
            return ' '.join(to_unicode(arg) for arg in text.args)
    return unicode(text)