Forging Headers to Bypass IP Restrictions When Scraping a Website
Some websites set up defenses against crawlers: for example, if a single IP generates heavy traffic within some time window, that IP gets banned. If the site is served through a CDN, you can work around this by setting the Host header to the site's domain and putting a CDN edge node's IP directly in the URL. Rotating through the edge IPs spreads your traffic across nodes, so no single path accumulates enough requests to trigger the ban.
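The core of the trick is only a few lines. Here is a minimal sketch (the IP is a documentation-range placeholder and the keyword is just an example; you would substitute an edge IP you resolved from the CDN domain yourself):

# coding=utf-8
import urllib2

# 203.0.113.10 is a placeholder for a real CDN edge IP of www.mouser.com
req = urllib2.Request("http://203.0.113.10/Search/Refine.aspx?Keyword=lm358",
                      headers={"Host": "www.mouser.com"})
print urllib2.urlopen(req, timeout=15).read()[:200]

The full script below wraps the same request in a class that also rotates edge IPs and logs which ones work.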
#coding=utf-8
import urllib2
import random
import os, sys

DIRNAME = os.path.dirname(os.path.abspath(__file__))

class Gethtml_mouser():

    def __init__(self, key, debug=False, **dict_args):
        self.key = key
        self.debug = debug
        self.i_headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Host": "www.mouser.com",
            "Accept-Language": "en-US",
            # forged cookie so that prices come back quoted in USD
            "Cookie": "preferences=ps=www&pl=en-US&pc_www=USDu; SDG1=1; SDG2=0; SDG3=0;"
        }
        # If set, info_url is the URL of a detail page to query directly;
        # its domain also determines the value of the Host header.
        self.info_url = dict_args.get('info_url', '')
        if self.info_url and 'cn.mouser.com' in self.info_url:
            self.i_headers['Host'] = 'cn.mouser.com'
        self.get_iplist()
        self.creat_temip()
    def get_iplist(self):
        '''Load the CDN ip list; mouser_ip.txt must exist in the same directory
        (a helper sketch for generating it follows the listing).'''
        iptxt = open(os.path.join(DIRNAME, 'mouser_ip.txt'), 'r')
        ipt = iptxt.read()
        # split on newlines and drop blank entries (e.g. from a trailing newline)
        self.iplist = [ip.strip() for ip in ipt.split('\n') if ip.strip()]
        iptxt.close()
    def log(self, flag):
        '''Append a record to mouser_log.txt, as data for later analysis
        (see the tallying sketch after the listing).'''
        try:
            f = open(os.path.join(DIRNAME, 'mouser_log.txt'), 'a')
            if flag == 0:
                log = "%s,success\n" % (self.tem_ip)  # the \n keeps one record per line
                if self.debug: print log
                f.write(log)
            elif flag == 1:
                log = "%s,fail\n" % (self.tem_ip)
                if self.debug: print "%s failed, trying the next ip..." % (self.tem_ip)
                f.write(log)
            else:
                if self.debug: print "log error"
            f.close()
        except Exception, e:
            if self.debug: print str(e)
            pass  # swallow errors, e.g. no write permission on the server
    def creat_temip(self):
        '''Pick a random ip from the list as the current temporary ip.'''
        self.tem_ip = random.choice(self.iplist)
    def get_html(self, max_num=10):
        '''
        Build the final url from the current temporary ip and return the fetched html.
        On any exception, switch to another ip and retry automatically until success
        or until max_num attempts are used up. The request is a plain GET (no POST data).
        '''
        if max_num > 0:
            if self.debug: print 'there are %s chances left to get the html' % max_num
            now_num = max_num - 1
            if self.info_url:
                final_url = self.info_url
            elif not self.key.startswith('ProductDetail'):
                final_url = "http://%s/Search/Refine.aspx?Keyword=%s" % (self.tem_ip, self.key)
            else:
                final_url = "http://%s/%s" % (self.tem_ip, self.key)
            if self.debug: print final_url
            req = urllib2.Request(final_url, headers=self.i_headers)
            try:
                if self.debug: print 'begin'
                if self.debug: print "use ip:", self.tem_ip
                page = urllib2.urlopen(req, timeout=15)  # 15-second timeout
                self.hc = page.read()
                info_url = page.geturl()
                # prepend the final (possibly redirected) url so callers can recover it
                self.hc = '<info_url>%s</info_url>' % str(info_url) + str(self.hc)
                self.log(0)
                if self.debug: print 'end'
                return self.hc
            except urllib2.HTTPError, e:
                if self.debug: print "Error Code:", e.code
                if self.debug: print "Error context:", e
                self.log(1)
                self.creat_temip()
                return self.get_html(now_num)  # return, so the retry's result propagates
            except urllib2.URLError, e:
                if self.debug: print "Error Reason:", e.reason
                self.log(1)
                self.creat_temip()
                return self.get_html(now_num)
            except:
                if self.debug: print "TimeOut error!"  # e.g. socket.timeout from read()
                self.log(1)
                self.creat_temip()
                return self.get_html(now_num)
        else:
            if self.debug: print 'retry limit reached, giving up'
            return ''
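A usage sketch, appended to the same script (the keyword 'lm358' is just an example, and mouser_ip.txt must exist as described above):

if __name__ == '__main__':
    g = Gethtml_mouser('lm358', debug=True)
    html = g.get_html()
    if html:
        print 'fetched %d bytes through %s' % (len(html), g.tem_ip)
    else:
        print 'gave up after exhausting the retries'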
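As for where mouser_ip.txt comes from: the script assumes it already exists. One way to seed it is to resolve the CDN domain repeatedly and collect the A records. This is only a sketch, and it is only as good as what your resolver returns; depending on the CDN, you may need to query from different resolvers or locations to collect more edge IPs.

# coding=utf-8
# one-off helper: resolve the CDN domain a few times and save the collected IPs
import socket

ips = set()
for _ in range(10):
    # gethostbyname_ex returns (hostname, aliases, ip_list); keep the ip_list
    ips.update(socket.gethostbyname_ex('www.mouser.com')[2])
open('mouser_ip.txt', 'w').write('\n'.join(sorted(ips)))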
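And since mouser_log.txt holds one ip,success or ip,fail record per line, the later analysis that the log() docstring mentions can be as simple as tallying results per IP. A sketch:

# coding=utf-8
# helper sketch: per-ip success/fail counts from mouser_log.txt
from collections import defaultdict

stats = defaultdict(lambda: [0, 0])
for line in open('mouser_log.txt'):
    ip, _, result = line.strip().partition(',')
    stats[ip][0 if result == 'success' else 1] += 1
for ip in sorted(stats):
    print ip, 'success:', stats[ip][0], 'fail:', stats[ip][1]

IPs that keep failing can then be pruned from mouser_ip.txt so the scraper wastes fewer retries.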