1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142 | import pycurl
from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
from urlparse import urlparse
def get_doc(url):
    '''Fetch ``url`` with pycurl and return the response body.

    Up to 5 redirects are followed; both the connect phase and the whole
    transfer are limited to 5 seconds.

    Returns the body as a string, or None when the transfer (or the curl
    setup) fails for any reason. This is a deliberate best-effort
    contract: callers only test the result for truthiness.
    '''
    import StringIO

    body = StringIO.StringIO()
    try:
        c = pycurl.Curl()
        try:
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.WRITEFUNCTION, body.write)
            c.setopt(pycurl.TIMEOUT, 5)
            c.setopt(pycurl.CONNECTTIMEOUT, 5)
            # MAXREDIRS is only honoured when FOLLOWLOCATION is enabled;
            # without it curl hands back the redirect response itself.
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            # Optional: route through a local SOCKS5 proxy (ssh tunnelling).
            #c.setopt(pycurl.PROXY, 'localhost:7654')
            #c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
            c.perform()
        finally:
            # Release the handle even when setopt()/perform() raises.
            c.close()
    except Exception:
        # Best effort: any failure means "no document". Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        return None
    return body.getvalue()
def get_domain(d):
    '''Reduce a host name to its last two dot-separated labels.

    'postsecret.blogspot.com' becomes 'blogspot.com'. A name with fewer
    than two dots is returned unchanged, and so is a name whose
    second-to-last dot is its very first character.
    '''
    pieces = d.rsplit('.', 2)
    # Fewer than three pieces: at most one dot, nothing to strip.
    # Empty leading piece: the second-to-last dot sits at index 0.
    if len(pieces) < 3 or not pieces[0]:
        return d
    return '.'.join(pieces[1:])
# Display names of the blog service providers (BSPs) this module can
# report. The tuple is public; its order is kept stable.
BSPs = (
    'Blogger',
    'Live Spaces',
    'LiveJournal',
    'WordPress',
    'AOL Journal',
    'XANGA',
    'Typepad',
    'MySpace',
    'Movable Type',
)

# Private aliases so the lookup tables below name each provider instead
# of referring to it by numeric position in BSPs.
(_BLOGGER, _LIVE_SPACES, _LIVEJOURNAL, _WORDPRESS, _AOL_JOURNAL,
 _XANGA, _TYPEPAD, _MYSPACE, _MOVABLE_TYPE) = BSPs

# Two-label host suffix -> provider hosting blogs under that domain.
DomainName2BSPMap = {
    'blogspot.com': _BLOGGER,
    'live.com': _LIVE_SPACES,
    'livejournal.com': _LIVEJOURNAL,
    'wordpress.com': _WORDPRESS,
    'aol.com': _AOL_JOURNAL,
    'xanga.com': _XANGA,
    'typepad.com': _TYPEPAD,
    'myspace.com': _MYSPACE,
}

# Lower-case keyword -> provider; matched as a substring of a lower-cased
# <meta name="generator"> content or RSD engine name.
BSPMetaGeneratorMap = {
    'blogger': _BLOGGER,
    'wordpress': _WORDPRESS,
    'typepad': _TYPEPAD,
    'movable type': _MOVABLE_TYPE,
    'movabletype': _MOVABLE_TYPE,
    'live spaces': _LIVE_SPACES,
}
def _match_platform(text):
    '''Return the platform whose keyword occurs in ``text``, else None.'''
    for keyword, platform in BSPMetaGeneratorMap.items():
        if keyword in text:
            return platform
    return None


def guess(url):
    '''Guess the blog platform behind ``url``.

    Checks, in order, and returns on the first hit:
    1. the host's two-label domain against DomainName2BSPMap;
    2. the page's <meta name="generator"> content against
       BSPMetaGeneratorMap;
    3. the engine name of the RSD document linked from the page against
       BSPMetaGeneratorMap.
    Returns 'Other' when nothing matches or the page cannot be fetched.

    REF:
    http://cyber.law.harvard.edu/blogs/gems/tech/rsd.html
    '''
    from urlparse import urljoin

    # Only the host is case-insensitive. The URL itself is fetched
    # unchanged because paths may be case-sensitive. ``hostname`` also
    # drops any userinfo and port that the raw netloc would carry.
    host = (urlparse(url).hostname or '').lower()
    platform = DomainName2BSPMap.get(get_domain(host))
    if platform is not None:
        return platform

    html = get_doc(url)
    if not html:
        return 'Other'

    metaStrainer = SoupStrainer('meta', attrs={'name': 'generator'})
    metas = list(BeautifulSoup(html, parseOnlyThese=metaStrainer))
    if metas:
        # A generator tag without a content attribute is treated as empty
        # instead of raising KeyError.
        generator = (metas[0].get('content') or '').lower()
        platform = _match_platform(generator)
        if platform is not None:
            return platform

    linkStrainer = SoupStrainer('link', title="RSD")
    links = list(BeautifulSoup(html, parseOnlyThese=linkStrainer))
    href = None
    if links:
        href = links[0].get('href')
    if not href:
        return 'Other'

    # The parser yields unicode attribute values and the original code
    # coerced the href with str() before fetching; encoding as UTF-8
    # gives the same result for ASCII and does not fail on non-ASCII.
    if not isinstance(href, str):
        href = href.encode('utf-8')
    # The RSD link may be relative to the page it was found on.
    rsd = get_doc(urljoin(url, href))
    if not rsd:
        return 'Other'

    try:
        enginename = BeautifulStoneSoup(rsd).rsd.service.enginename.string.lower()
    except AttributeError:
        # One of the rsd/service/engineName elements (or its text) is
        # missing, so there is nothing to match.
        enginename = ''
    platform = _match_platform(enginename)
    if platform is not None:
        return platform
    return 'Other'
if __name__ == '__main__':
    # Manual smoke test: print each sample URL followed by the platform
    # guessed for it. Trailing comments record the platform noted by the
    # original author. This performs live network requests.
    sample_urls = (
        'http://www.boingboing.net/',  # Movable Type
        'http://www.engadget.com/',  # Other
        'http://www.gizmodo.com/',
        'http://www.techcrunch.com/',  # WordPress
        'http://www.huffingtonpost.com/',
        'http://www.lifehacker.com/',
        'http://arstechnica.com/',
        'http://postsecret.blogspot.com/',
        'http://www.dailykos.com/',
        'http://michellemalkin.com/',
        'http://www.tmz.com/',
        'http://www.ilemoned.com/',  # WordPress
        'http://headrush.typepad.com/',
        'http://thinkprogress.org/',
        'http://googleblog.blogspot.com/',
        'http://sethgodin.typepad.com/',
        'http://yanxi.bokewu.com/',
        'http://www.crooksandliars.com/',
        'http://www.kotaku.com/',
        'http://www.beppegrillo.it/',  # Movable Type
        'http://rateyourstudents.blogspot.com/',
        'http://ninas72.spaces.live.com/',
        'http://eshm.livejournal.com/',
        'http://blogs4brownback.wordpress.com/',
        'http://journals.aol.com/dailypulseblog/citizenjournalism/',
        'http://www.xanga.com/MonchiKi',
        'http://etherbrian.typepad.com/',
        'http://www.myspace.com/nathanfillion',
    )
    for sample in sample_urls:
        # Single pre-formatted string: same "url platform" line as the
        # two-argument print statement.
        print('%s %s' % (sample, guess(sample)))
|
Comments
Test code
from blogdetect import get_domain, guess
import unittest
class TestBlogDetect(unittest.TestCase):
#