Detect blog platform.
As we all know, there are many blog platforms in the wild, e.g. Blogger.com, WordPress, LiveJournal, Movable Type, etc.
This little snippet guesses the blog platform from a URL.
Dependency:
- pycurl
- BeautifulSoup
import pycurl
from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
from urlparse import urlparse
def get_doc(url):
    '''Fetch url with pycurl and return the raw response body.

    The transfer is limited to 5 seconds overall and 5 seconds for the
    connection phase, and follows at most 5 redirects.

    Returns the body as a byte string, or None if the transfer fails.
    '''
    from io import BytesIO

    body = BytesIO()
    c = pycurl.Curl()
    try:
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.WRITEFUNCTION, body.write)
        c.setopt(pycurl.TIMEOUT, 5)
        c.setopt(pycurl.CONNECTTIMEOUT, 5)
        # MAXREDIRS only has an effect when redirect following is enabled.
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        # To go through an ssh tunnel, route the request via a SOCKS5 proxy:
        #c.setopt(pycurl.PROXY, 'localhost:7654')
        #c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
        c.perform()
    except pycurl.error:
        # Best effort: any transfer failure is reported as "no document".
        return None
    finally:
        # Release the handle on the failure path too.
        c.close()
    return body.getvalue()
def get_domain(d):
    '''Return the last two dot-separated labels of a network location.

    d is the netloc part of a URL, e.g. 'www.example.com:8080'. Any
    'user:password@' prefix and ':port' suffix are dropped first, so that
    'foo.blogspot.com:80' yields 'blogspot.com'.

    A host with fewer than three labels (or with an empty leading label)
    is returned unchanged. No public-suffix handling is done, so
    'www.example.co.uk' yields 'co.uk'.
    '''
    # Drop userinfo, keeping whatever follows the last '@'.
    host = d.rpartition('@')[2]

    # Drop the port. Requiring an empty or all-digit tail keeps bracketed
    # IPv6 literals such as '[::1]' intact.
    head, sep, tail = host.rpartition(':')
    if sep and (not tail or tail.isdigit()):
        host = head

    labels = host.rsplit('.', 2)
    if len(labels) == 3 and labels[0]:
        return '.'.join(labels[1:])
    return host
# Known blog service providers / platforms. The position of each name is
# referenced by the two lookup tables below.
BSPs = (
    'Blogger',       # 0
    'Live Spaces',   # 1
    'LiveJournal',   # 2
    'WordPress',     # 3
    'AOL Journal',   # 4
    'XANGA',         # 5
    'Typepad',       # 6
    'MySpace',       # 7
    'Movable Type',  # 8
)

# Hosting domain (as returned by get_domain) -> platform name.
DomainName2BSPMap = dict(
    (domain, BSPs[index]) for domain, index in (
        ('blogspot.com', 0),     # Blogger
        ('live.com', 1),         # Live Spaces
        ('livejournal.com', 2),  # LiveJournal
        ('wordpress.com', 3),    # WordPress
        ('aol.com', 4),          # AOL Journal
        ('xanga.com', 5),        # XANGA
        ('typepad.com', 6),      # Typepad
        ('myspace.com', 7),      # MySpace
    )
)

# Lower-case keyword looked for inside a <meta name="generator"> value or an
# RSD engine name -> platform name.
BSPMetaGeneratorMap = dict(
    (keyword, BSPs[index]) for keyword, index in (
        ('blogger', 0),       # Blogger
        ('wordpress', 3),     # WordPress
        ('typepad', 6),       # Typepad
        ('movable type', 8),  # Movable Type
        ('movabletype', 8),   # Movable Type
        ('live spaces', 1),   # Live Spaces
    )
)
def _match_platform(text):
    '''Return the platform whose BSPMetaGeneratorMap keyword occurs in text, or None.'''
    for keyword, platform in BSPMetaGeneratorMap.items():
        if keyword in text:
            return platform
    return None


def _guess_from_generator(html):
    '''Return the platform named by the first <meta name="generator"> tag, or None.'''
    strainer = SoupStrainer('meta', attrs={'name': 'generator'})
    metas = [meta for meta in BeautifulSoup(html, parseOnlyThese=strainer)]
    if not metas:
        return None
    # A generator tag without a content attribute carries no information.
    content = metas[0].get('content')
    if not content:
        return None
    return _match_platform(content.lower())


def _guess_from_rsd(html, base_url):
    '''Return the platform named by the page's RSD document, or None.'''
    from urlparse import urljoin

    strainer = SoupStrainer('link', title="RSD")
    links = [link for link in BeautifulSoup(html, parseOnlyThese=strainer)]
    if not links:
        return None
    href = links[0].get('href')
    if not href:
        return None
    try:
        # The href may be relative to the page; get_doc needs an absolute,
        # plain-str URL.
        rsd_url = str(urljoin(base_url, href))
    except UnicodeError:
        return None
    rsd = get_doc(rsd_url)
    if not rsd:
        return None
    soup = BeautifulStoneSoup(rsd)
    try:
        enginename = soup.rsd.service.enginename.string.lower()
    except AttributeError:
        # One of the rsd/service/engineName elements is missing or empty.
        return None
    return _match_platform(enginename)


def guess(url):
    '''Guess the blog platform from a url.

    1. If the url's domain matches a known hosting domain, return it;
    2. If the page's meta generator matches, return it;
    3. If the RSD engine name matches, return it;
    4. Else return 'Other'.

    Returns one of the names in BSPs, or the string 'Other'.

    REF:
    http://cyber.law.harvard.edu/blogs/gems/tech/rsd.html
    '''
    # Only the host is case-insensitive; the url itself is fetched as given
    # so that case-sensitive paths keep working.
    host = urlparse(url).hostname or ''
    domain = get_domain(host)
    if domain in DomainName2BSPMap:
        return DomainName2BSPMap[domain]

    html = get_doc(url)
    if html:
        platform = _guess_from_generator(html)
        if platform is None:
            platform = _guess_from_rsd(html, url)
        if platform is not None:
            return platform
    return 'Other'
if __name__ == '__main__':
    # Demo: print each sample blog url followed by the platform guessed for
    # it. Trailing comments give the expected platform where it is known.
    urls = [
        'http://www.boingboing.net/',  # Movable Type
        'http://www.engadget.com/',  # Other
        'http://www.gizmodo.com/',
        'http://www.techcrunch.com/',  # WordPress
        'http://www.huffingtonpost.com/',
        'http://www.lifehacker.com/',
        'http://arstechnica.com/',
        'http://postsecret.blogspot.com/',
        'http://www.dailykos.com/',
        'http://michellemalkin.com/',
        'http://www.tmz.com/',
        'http://www.ilemoned.com/',  # WordPress
        'http://headrush.typepad.com/',
        'http://thinkprogress.org/',
        'http://googleblog.blogspot.com/',
        'http://sethgodin.typepad.com/',
        'http://yanxi.bokewu.com/',
        'http://www.crooksandliars.com/',
        'http://www.kotaku.com/',
        'http://www.beppegrillo.it/',  # Movable Type
        'http://rateyourstudents.blogspot.com/',
        'http://ninas72.spaces.live.com/',
        'http://eshm.livejournal.com/',
        'http://blogs4brownback.wordpress.com/',
        'http://journals.aol.com/dailypulseblog/citizenjournalism/',
        'http://www.xanga.com/MonchiKi',
        'http://etherbrian.typepad.com/',
        'http://www.myspace.com/nathanfillion',
    ]
    for url in urls:
        platform = guess(url)
        print('%s %s' % (url, platform))
|
More like this
- Template tag - list punctuation for a list of items by shapiromatron 10 months, 1 week ago
- JSONRequestMiddleware adds a .json() method to your HttpRequests by cdcarter 10 months, 2 weeks ago
- Serializer factory with Django Rest Framework by julio 1 year, 5 months ago
- Image compression before saving the new model / work with JPG, PNG by Schleidens 1 year, 6 months ago
- Help text hyperlinks by sa2812 1 year, 6 months ago
Comments
Please login first before commenting.