伪造header突破ip限制抓取网站内容

有些网站为了防止爬虫抓取而设定了某些条件,比如如果一个ip某时间段内访问量很大,则禁止这个ip访问。如果这个网站启用了cdn加速服务的话,可以修改header中的host,并在url中直接用ip进行网站的访问。

#coding=utf-8

import urllib2
import random
import os,sys
# Absolute directory containing this module; used to locate the sibling
# data files mouser_ip.txt (IP pool) and mouser_log.txt (fetch log).
DIRNAME = os.path.dirname(os.path.abspath(__file__))
class Gethtml_mouser():
    def __init__(self,key,debug=False,**dict_args):
        self.key = key
        self.debug = debug
        self.i_headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0",
                          "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                          "Host":"www.mouser.com",
                          "Accept-Language":"en-US",
                          "Cookie":"preferences=ps=www&pl=en-US&pc_www=USDu; SDG1=1; SDG2=0; SDG3=0;"
                          }#伪造一个抓取USD的cookie

        ''' 如有效 就代表查询详细信息页面url 并且判断Host头信息的域名取值 '''
        self.info_url   = dict_args.get('info_url', '')
        if self.info_url and 'cn.mouser.com' in self.info_url:
            self.i_headers['Host']  = 'cn.mouser.com'
        self.get_iplist()
        self.creat_temip()
    def get_iplist(self):
        '''获取mouser_ip列表,请保证同目录下存在mouser_ip.txt'''
        iptxt = file(os.path.join(DIRNAME,'mouser_ip.txt'),'r')
        ipt = iptxt.read()
        self.iplist = ipt.split('n')
        iptxt.close()
    def log(self,flag):
        '''日志记录,生成mouser_log.txt为以后记录分析提供数据'''
        try:
            f = file(os.path.join(DIRNAME,'mouser_log.txt'),'a')
            if flag == 0:
                log = "%s,sucessn" % (self.tem_ip)   #此处的n为了分析日志用

                if self.debug: print log
                f.write(log)
            elif flag == 1:
                log = "%s,failn" % (self.tem_ip)
                if self.debug: print "%s is failed,try next ip..." % (self.tem_ip)
                f.write(log)
            else:
                if self.debug: print "log error"
            f.close()
        except Exception,e:
            if self.debug: print str(e)
            pass#防止由于服务器上没有写权限出错。

    def creat_temip(self):
        '''随机获取一个临时ip'''
        ip_len = len(self.iplist)
        ip_num = random.randint(0,ip_len-1)
        self.tem_ip = self.iplist[ip_num]
    def get_html(self,max_num=10):
        '''
         根据获取的临时ip构造最终url并返回所抓取的html
         如果发生异常则更换ip继续自动请求直到获取成功
         prams为空使用GET请求
        '''
        if max_num > 0:
            if self.debug: print 'there are %s changes to get html' % max_num
            now_num     = max_num - 1
            params={}
            if self.info_url:
                final_url   = self.info_url
            elif not self.key.startswith('ProductDetail'):
                final_url = "http://%s/Search/Refine.aspx?Keyword=%s" % (self.tem_ip,self.key)
            else:
                final_url = "http://%s/%s" % (self.tem_ip,self.key)
            if self.debug: print final_url
            req = urllib2.Request(final_url, headers=self.i_headers)
            try:
                if self.debug: print 'begin'
                if self.debug: print "use ip:",self.tem_ip
                page = urllib2.urlopen(req,timeout=15)#15秒超时

                self.hc = page.read()
                info_url = page.geturl()
                self.hc = '<info_url>%s</info_url>' % str(info_url) + str(self.hc)
                #print len(hc)

                #open('tmpa.html','w').write(hc)

                self.log(0)
                if self.debug: print 'end'
                return self.hc
            except urllib2.HTTPError, e:  
                if self.debug: print "Error Code:", e.code
                if self.debug: print "Error context:",e
                self.log(1)
                self.creat_temip()
                self.get_html(now_num)
            except urllib2.URLError, e:  
                if self.debug: print "Error Reason:", e.reason
                self.log(1)
                self.creat_temip()
                self.get_html(now_num)
            except:
                if self.debug: print "TimeOut error!"
                self.log(1)
                self.creat_temip()
                self.get_html(now_num)
        else:
            if self.debug: print 'Find error (had use max_num changes) ""'
            return ''