Login

Detect blog platform

Author:
twinsant
Posted:
May 23, 2007
Language:
Python
Version:
.96
Score:
2 (after 2 ratings)

Detect blog platform.

As we all known, there are so many blog platform in the wild, e.g. Blogger.com, WordPress, LiveJournal, Movable Type etc.

This little snippet could guess the blog platform according a url.

Dependency:

  1. pycurl
  2. BeautifulSoup
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import pycurl
from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
from urlparse import urlparse

def get_doc(url):
    try:
        c = pycurl.Curl()

        c.setopt(pycurl.URL, url)
        import StringIO
        b = StringIO.StringIO()
        c.setopt(pycurl.WRITEFUNCTION, b.write)
        c.setopt(pycurl.TIMEOUT, 5)
        c.setopt(pycurl.CONNECTTIMEOUT, 5)
        c.setopt(pycurl.MAXREDIRS, 5)
        #c.setopt(pycurl.PROXY, 'localhost:7654') # ssh tunnelling
        #c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)

        c.perform()
        c.close()
        return b.getvalue()
    except:
        return None

def get_domain(d):
    pos = d.rfind('.', 0, d.rfind('.'))
    if pos > 0:
        return d[pos+1:]
    else:
        return d

BSPs = (
    'Blogger', # 0
    'Live Spaces', # 1
    'LiveJournal', # 2
    'WordPress', # 3
    'AOL Journal', # 4
    'XANGA', # 5
    'Typepad', # 6
    'MySpace', # 7
    'Movable Type', # 8
)

DomainName2BSPMap = {
    'blogspot.com':BSPs[0],
    'live.com':BSPs[1],
    'livejournal.com':BSPs[2],
    'wordpress.com':BSPs[3],
    'aol.com':BSPs[4],
    'xanga.com':BSPs[5],
    'typepad.com':BSPs[6],
    'myspace.com':BSPs[7],
}

BSPMetaGeneratorMap = {
    'blogger':BSPs[0],
    'wordpress':BSPs[3],
    'typepad':BSPs[6],
    'movable type':BSPs[8],
    'movabletype':BSPs[8],
    'live spaces':BSPs[1],
}

def guess(url):
    '''Guess blog platform according a url.

    1. If url match domain pattern, return;
    2. If meta generator match, return;
    3. If RSD engine name match, return;
    4. Else return Other.

    REF:
        http://cyber.law.harvard.edu/blogs/gems/tech/rsd.html
    '''
    url = url.lower()
    t = urlparse(url)
    domain = get_domain(t[1])
    if DomainName2BSPMap.has_key(domain):
        return DomainName2BSPMap[domain]

    html = get_doc(url)
    if html:
        metaStrainer = SoupStrainer('meta', attrs={'name':'generator'})
        metas = [meta for meta in BeautifulSoup(html, parseOnlyThese=metaStrainer)]
        if metas:
            generator = metas[0]['content'].lower()
            for k, v in BSPMetaGeneratorMap.iteritems():
                if k in generator:
                    return v

        linkStrainer = SoupStrainer('link', title="RSD")
        links = [link for link in BeautifulSoup(html, parseOnlyThese=linkStrainer)]
        if links:
            rsd_url = str(links[0]['href'])
            rsd = get_doc(rsd_url)
            if rsd:
                soup = BeautifulStoneSoup(rsd)
                try:
                    enginename = soup.rsd.service.enginename.string.lower()
                    for k, v in BSPMetaGeneratorMap.iteritems():
                        if k in enginename:
                            return v

                except AttributeError:
                    pass
    return 'Other'

if __name__ == '__main__':
    urls = [
    'http://www.boingboing.net/', # Movable Type
    'http://www.engadget.com/', # Other
    'http://www.gizmodo.com/',
    'http://www.techcrunch.com/', # WordPress
    'http://www.huffingtonpost.com/',
    'http://www.lifehacker.com/',
    'http://arstechnica.com/',
    'http://postsecret.blogspot.com/',
    'http://www.dailykos.com/',
    'http://michellemalkin.com/',
    'http://www.tmz.com/',
    'http://www.ilemoned.com/', # WordPress
    'http://headrush.typepad.com/',
    'http://thinkprogress.org/',
    'http://googleblog.blogspot.com/',
    'http://sethgodin.typepad.com/',
    'http://yanxi.bokewu.com/',
    'http://www.crooksandliars.com/',
    'http://www.kotaku.com/',
    'http://www.beppegrillo.it/', # Movable Type

    'http://rateyourstudents.blogspot.com/',
    'http://ninas72.spaces.live.com/',
    'http://eshm.livejournal.com/',
    'http://blogs4brownback.wordpress.com/',
    'http://journals.aol.com/dailypulseblog/citizenjournalism/',
    'http://www.xanga.com/MonchiKi',
    'http://etherbrian.typepad.com/',
    'http://www.myspace.com/nathanfillion',
    ]

    for url in urls:
        print url, guess(url)

More like this

  1. Template tag - list punctuation for a list of items by shapiromatron 2 months ago
  2. JSONRequestMiddleware adds a .json() method to your HttpRequests by cdcarter 2 months, 1 week ago
  3. Serializer factory with Django Rest Framework by julio 9 months, 1 week ago
  4. Image compression before saving the new model / work with JPG, PNG by Schleidens 9 months, 4 weeks ago
  5. Help text hyperlinks by sa2812 10 months, 3 weeks ago

Comments

Please login first before commenting.