Detect blog platform

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import pycurl
from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
from urlparse import urlparse

def get_doc(url):
    '''Fetch ``url`` with pycurl and return the response body.

    Returns the body as a string, or None when setting up or performing
    the transfer fails; callers treat None as "no document available".
    '''
    c = None
    try:
        import StringIO
        b = StringIO.StringIO()

        c = pycurl.Curl()
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.WRITEFUNCTION, b.write)
        c.setopt(pycurl.TIMEOUT, 5)
        c.setopt(pycurl.CONNECTTIMEOUT, 5)
        # MAXREDIRS only takes effect when redirect following is enabled;
        # without FOLLOWLOCATION a redirecting blog yields the 3xx stub page.
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        #c.setopt(pycurl.PROXY, 'localhost:7654') # ssh tunnelling
        #c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)

        c.perform()
        return b.getvalue()
    except Exception:
        # Deliberately best-effort: any fetch problem means "no document".
        # Catching Exception (not a bare except) lets Ctrl-C and SystemExit
        # propagate instead of being silently turned into None.
        return None
    finally:
        # Release the curl handle on failure as well as on success.
        if c is not None:
            c.close()

def get_domain(d):
    '''Return the last two dot-separated labels of host name ``d``.

    ``d`` may be a bare host name or a URL network location, so any
    ``user:password@`` prefix, ``:port`` suffix and trailing root dot are
    dropped before the labels are counted.  Names with fewer than two
    labels are returned as they are (after that clean-up).

    Examples: 'www.blogspot.com' -> 'blogspot.com',
              'foo.typepad.com:8080' -> 'typepad.com',
              'localhost' -> 'localhost'.
    '''
    # Keep only what follows the last '@' (userinfo, if any, precedes it).
    host = d.rsplit('@', 1)[-1]
    # Bracketed IPv6 literals contain ':' themselves; leave those untouched.
    if not host.startswith('['):
        host = host.split(':', 1)[0]
    # 'example.com.' (explicit root) names the same host as 'example.com'.
    host = host.rstrip('.')
    return '.'.join(host.split('.')[-2:])

# Display names of the blog service providers (BSPs) this module can report.
BSPs = (
    'Blogger',
    'Live Spaces',
    'LiveJournal',
    'WordPress',
    'AOL Journal',
    'XANGA',
    'Typepad',
    'MySpace',
    'Movable Type',
)

# Private aliases so the tables below name platforms instead of indexing BSPs.
(_BLOGGER, _LIVE_SPACES, _LIVEJOURNAL, _WORDPRESS, _AOL_JOURNAL,
 _XANGA, _TYPEPAD, _MYSPACE, _MOVABLE_TYPE) = BSPs

# Hosting domain (as produced by get_domain) -> platform name.
DomainName2BSPMap = {
    'blogspot.com': _BLOGGER,
    'live.com': _LIVE_SPACES,
    'livejournal.com': _LIVEJOURNAL,
    'wordpress.com': _WORDPRESS,
    'aol.com': _AOL_JOURNAL,
    'xanga.com': _XANGA,
    'typepad.com': _TYPEPAD,
    'myspace.com': _MYSPACE,
}

# Lowercase substring looked for in a generator / engine name -> platform name.
BSPMetaGeneratorMap = {
    'blogger': _BLOGGER,
    'wordpress': _WORDPRESS,
    'typepad': _TYPEPAD,
    'movable type': _MOVABLE_TYPE,
    'movabletype': _MOVABLE_TYPE,
    'live spaces': _LIVE_SPACES,
}

def _match_platform(text):
    '''Return the platform whose BSPMetaGeneratorMap key occurs in text, else None.'''
    for key, platform in BSPMetaGeneratorMap.items():
        if key in text:
            return platform
    return None

def guess(url):
    '''Guess the blog platform behind ``url``.

    Checks, in order, and returns on the first hit:

    1. the URL's domain against DomainName2BSPMap (no network access);
    2. the page's <meta name="generator"> content against
       BSPMetaGeneratorMap;
    3. the engine name of the RSD document linked from the page against
       BSPMetaGeneratorMap.

    Returns one of the names in BSPs, or 'Other' when nothing matches or
    the page cannot be fetched.

    REF:
        http://cyber.law.harvard.edu/blogs/gems/tech/rsd.html
    '''
    from urlparse import urljoin

    # Only the host part is case-insensitive; the URL itself is fetched
    # unchanged because paths and query strings are case-sensitive.
    domain = get_domain(urlparse(url)[1].lower())
    if domain in DomainName2BSPMap:
        return DomainName2BSPMap[domain]

    html = get_doc(url)
    if not html:
        return 'Other'

    metaStrainer = SoupStrainer('meta', attrs={'name':'generator'})
    metas = [meta for meta in BeautifulSoup(html, parseOnlyThese=metaStrainer)]
    if metas:
        # A generator tag without a content attribute simply does not match.
        generator = (metas[0].get('content') or '').lower()
        platform = _match_platform(generator)
        if platform:
            return platform

    linkStrainer = SoupStrainer('link', title="RSD")
    links = [link for link in BeautifulSoup(html, parseOnlyThese=linkStrainer)]
    href = None
    if links:
        href = links[0].get('href')
    if href:
        try:
            # The href may be relative to the page; resolve it first.
            rsd_url = str(urljoin(url, href))
        except UnicodeError:
            # Non-ASCII href that cannot be handed to get_doc as a plain str.
            rsd_url = None
        rsd = None
        if rsd_url:
            rsd = get_doc(rsd_url)
        if rsd:
            soup = BeautifulStoneSoup(rsd)
            try:
                enginename = soup.rsd.service.enginename.string.lower()
            except AttributeError:
                # Some element on the rsd/service/engineName path is missing
                # or has no text.
                enginename = None
            if enginename:
                platform = _match_platform(enginename)
                if platform:
                    return platform
    return 'Other'

if __name__ == '__main__':
    # Manual smoke test: guess the platform of a set of well-known blogs and
    # print one "<url> <platform>" line per blog.  Needs network access.
    sample_urls = (
        # Popular independent blogs (trailing comments note the platform
        # recorded for some of them).
        'http://www.boingboing.net/', # Movable Type
        'http://www.engadget.com/', # Other
        'http://www.gizmodo.com/',
        'http://www.techcrunch.com/', # WordPress
        'http://www.huffingtonpost.com/',
        'http://www.lifehacker.com/',
        'http://arstechnica.com/',
        'http://postsecret.blogspot.com/',
        'http://www.dailykos.com/',
        'http://michellemalkin.com/',
        'http://www.tmz.com/',
        'http://www.ilemoned.com/', # WordPress
        'http://headrush.typepad.com/',
        'http://thinkprogress.org/',
        'http://googleblog.blogspot.com/',
        'http://sethgodin.typepad.com/',
        'http://yanxi.bokewu.com/',
        'http://www.crooksandliars.com/',
        'http://www.kotaku.com/',
        'http://www.beppegrillo.it/', # Movable Type

        # One blog on each hosted service listed in DomainName2BSPMap.
        'http://rateyourstudents.blogspot.com/',
        'http://ninas72.spaces.live.com/',
        'http://eshm.livejournal.com/',
        'http://blogs4brownback.wordpress.com/',
        'http://journals.aol.com/dailypulseblog/citizenjournalism/',
        'http://www.xanga.com/MonchiKi',
        'http://etherbrian.typepad.com/',
        'http://www.myspace.com/nathanfillion',
    )

    for sample in sample_urls:
        # Single pre-formatted argument: prints the same text whether print
        # is parsed as a statement or called as a function.
        print('%s %s' % (sample, guess(sample)))

More like this

  1. Type less with newforms admin by ncw 5 years, 9 months ago
  2. Improved model select field for generic relationships by kratorius 4 years, 9 months ago
  3. Unobtrusive comment moderation by ubernostrum 7 years, 1 month ago
  4. Caching XHTML render_to_response by smoonen 5 years, 9 months ago
  5. Get object/list or None by lokesh 4 years, 7 months ago

Comments

(Forgotten your password?)