from django.http import HttpResponseForbidden
# Substrings that identify known crawlers in a User-Agent header. The
# middleware below tests them with a case-sensitive ``in`` check, which is why
# both 'Googlebot' and 'googlebot' are listed.
BotNames = [
    'Googlebot',
    'Slurp',
    'Twiceler',
    'msnbot',
    'KaloogaBot',
    'YodaoBot',
    # The original entry carried a stray leading double quote and therefore
    # could never match a real Baidu user agent.
    'Baiduspider',
    'googlebot',
    'Speedy Spider',
    'DotBot',
]

# Name of the view keyword argument that switches crawler blocking on for a
# URL; the middleware looks it up in ``view_kwargs``.
param_name = 'deny_crawlers'
class CrawlerBlocker:
    """Flag requests coming from known crawlers and keep them off opted-in views.

    ``process_request`` sets ``request.is_crawler`` by looking for any of the
    ``BotNames`` substrings in the ``User-Agent`` header, and rejects requests
    that send no user agent at all. ``process_view`` answers 403 to flagged
    requests when the view keyword arguments contain a truthy ``param_name``
    entry (``deny_crawlers``).

    The class can be instantiated without arguments (hook-style middleware)
    or with a ``get_response`` callable and then called with the request
    (callable-style middleware).
    """

    def __init__(self, get_response=None):
        # Optional so that loaders which instantiate the class without
        # arguments keep working.
        self.get_response = get_response

    def __call__(self, request):
        """Run ``process_request`` and, unless it answered, the rest of the chain."""
        response = self.process_request(request)
        if response is None:
            response = self.get_response(request)
        return response

    def process_request(self, request):
        """Set ``request.is_crawler``; return a 403 response if no user agent was sent.

        Returns ``None`` when the request may continue.
        """
        user_agent = request.META.get('HTTP_USER_AGENT')
        if not user_agent:
            # The message used to say "username", but the check is on the
            # User-Agent header.
            return HttpResponseForbidden(
                'request without user agent are not supported. sorry'
            )

        # Case-sensitive substring match; any() stops at the first hit.
        request.is_crawler = any(botname in user_agent for botname in BotNames)
        return None

    def process_view(self, request, view_func, view_args, view_kwargs):
        """Return a 403 response for crawlers on views that opted out of crawling.

        The control keyword argument is always removed from ``view_kwargs``
        (mutated in place), whatever its value, so the view never receives an
        argument it does not declare. Returns ``None`` when the view may run.
        """
        if param_name not in view_kwargs:
            return None

        deny_crawlers = view_kwargs.pop(param_name)
        # getattr: tolerate a request on which process_request never set the
        # flag instead of raising AttributeError.
        if deny_crawlers and getattr(request, 'is_crawler', False):
            return HttpResponseForbidden(
                'adress removed from crawling. check robots.txt'
            )
        return None
Comments
Kinda cool. One can complete the bot list here: http://www.robotstxt.org/db.html
#
Using the user-agent to block bots will only stop rookie/noob spammers, because the user-agent can easily be changed to whatever the bot writer wants. I think a much better solution is to whitelist the IP address blocks of Google, MSN and other safe bots, and block all other bots.
#