IP+IDC-chinaz抓取

  • A+
所属分类:神兵利刃

#-*-coding:gbk-*-
#code by anyun.org
import urllib
import re
import time
 
 
def getHtml(url):
    """Download ``url`` and return its body with layout whitespace stripped.

    Newlines are removed and runs of spaces are collapsed by the literal
    replacements below; isolated single spaces are left in place.

    The response object is always closed, even when reading fails, so a
    long batch of lookups does not accumulate open sockets.
    """
    page = urllib.urlopen(url)
    try:
        html = page.read()
    finally:
        page.close()

    html = html.replace('\n', '')
    html = html.replace('       ', ' ')
    html = html.replace('   ', '')
    # NOTE(review): this repeats the previous replacement as it appears in the
    # original; kept so the returned text is unchanged.
    html = html.replace('   ', '')
    return html
 
 
def getcontext(html):
    """Return the inner text of every ``<span class="Whwtdhalf w15-0">``.

    Matches are non-greedy and do not span newlines. The result is a list
    of strings in document order, empty when nothing matches.
    """
    pattern = re.compile(r'<span class="Whwtdhalf w15-0">(.*?)</span>')
    return [match.group(1) for match in pattern.finditer(html)]
 
def getadd(html):
    """Return the inner text of every ``<span class="Whwtdhalf w50-0">``.

    Matches are non-greedy and do not span newlines. The result is a list
    of strings in document order, empty when nothing matches.
    """
    pattern = re.compile(r'<span class="Whwtdhalf w50-0">(.*?)</span>')
    return [match.group(1) for match in pattern.finditer(html)]
 
def geterr(html):
    """Return the inner text of every ``<div class="col-red lh30 fz14 jspu">``.

    Matches are non-greedy and do not span newlines. The result is a list
    of strings in document order, empty when nothing matches.
    """
    pattern = re.compile(r'<div class="col-red lh30 fz14 jspu">(.*?)</div>')
    return [match.group(1) for match in pattern.finditer(html)]
 
if __name__ == '__main__':
    # Batch driver: look up every host listed in list.txt, append successful
    # results to ok.txt and failed hosts to err.txt, pausing between requests.

    # Read all targets first so list.txt is closed before any network work.
    with open('list.txt', 'r') as f:
        targets = [line.strip() for line in f]

    for i in targets:
        if not i:
            # A blank line would only trigger a pointless lookup.
            continue

        Url = 'http://ip.chinaz.com/?ip=http://' + i
        try:
            Html = getHtml(Url)
        except IOError:
            # urllib reports network failures as IOError; skip this host
            # rather than aborting the rest of the batch.
            print(i + ' error')
            time.sleep(0.5)
            continue

        # Parse the page once instead of re-running the regexes per field.
        context = getcontext(Html)
        add = getadd(Html)

        # Require the full set of fields so an unexpected page layout is
        # recorded as a failure instead of raising IndexError.
        if not geterr(Html) and len(context) >= 6 and len(add) >= 2:
            # Output order interleaves the six context fields as 0,3,1,4,2,5
            # and then appends the first two address fields.
            line = ' '.join([context[0], context[3],
                             context[1], context[4],
                             context[2], context[5],
                             add[0], add[1]])
            print(line)
            with open('ok.txt', 'a') as f1:
                f1.write(line + '\n')
        else:
            line = ' '.join([i, '解析失败'])
            print(line)
            with open('err.txt', 'a') as f2:
                f2.write(line + '\n')

        # Pause between requests.
        time.sleep(0.5)
    print('over')

  • 我的微信
  • 这是我的微信扫一扫
  • weixin
  • 我的微信公众号
  • 我的微信公众号扫一扫
  • weixin

发表评论

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen: