简单抓取FreeBuf工具列表 | 安云网

发表评论
322 次浏览

A+

想把FreeBuf上介绍的所有工具整理成一个列表，然后从中找出那些自己感兴趣而又没有了解过的工具。本来想着用BurpSuite简单地抓取一下，但是实际操作时发现Burp的正则匹配存在问题。于是花了点时间配置notepad++，用python写了个脚本来抓取。现在分享出来，给有需要的人参考，也便于了解此类问题的简单处理方法。

具体如下:

python脚本:

#coding=utf-8

'''
Freebuf工具抓取脚本

http://hi.baidu.com/l34rn

'''

import re,sys
from urllib     import urlopen
from Queue      import Queue
from threading  import Thread
from time       import strftime


# --- Crawl configuration ---------------------------------------------------
# Prefix of every numbered listing page; the page number is appended to it.
baseUrl = 'http://www.freebuf.com/tools/page/'
# Name of the HTML report written by main().
outPut = 'FreebufToolsListX.html'
# Exclusive upper bound of the page range queued by main(): pages 1..32.
pageNum = 32 + 1
# How many WorkThread workers main() starts.
threadNum = 10

# --- Shared mutable state --------------------------------------------------
# Matched HTML lines collected by spiderIndex().
urlList = []
# Worker threads created by main().
threadList = []
# Running count of matches, shown in the progress line.
urlNum = 0
    
def spiderIndex(url):
    '''
    Download one listing page and collect the lines that link to a tool.

    Each line of the response body that matches the tool-link pattern is
    appended to the module-level ``urlList`` and counted in ``urlNum``; the
    progress line on stdout is rewritten after every match.

    url -- address of the listing page to fetch.

    Returns None. A request that raises is reported on stdout and the page
    is skipped; a response whose status code is not 200 is skipped silently.
    '''
    global urlNum
    try:
        res = urlopen(url)
    except Exception as e:
        # Nothing to parse without a response: report the failure and stop,
        # instead of falling through to an unbound ``res``.
        print('[-] [%s] [Error] [%s]' % (strftime('%X'), e))
        return

    try:
        if res.getcode() != 200:
            return
        html = res.read()
    finally:
        # Release the connection whether or not the page was usable.
        res.close()

    # Compiled once per page rather than once per line.
    pattern = re.compile(r'(<dt><a href=\")(http://www.freebuf.com/tools/\d*\.html)(\" target=\"_blank\">).*')
    for line in html.split('\n'):
        rex = pattern.search(line)
        if rex is not None:
            # NOTE: WorkThread calls this from several threads and this
            # increment is not synchronized; the counter only feeds the
            # progress display, the report uses len(urlList).
            urlNum += 1
            # group() is the whole match, i.e. the anchor tag plus the
            # rest of the line.
            urlList.append(rex.group())
            sys.stdout.write('\r[*] [%s] [Working] [%s]'%(str(strftime('%X')) ,str(urlNum)))
    
class WorkThread(Thread):
    '''
    Worker thread that crawls the listing pages named by a shared queue.

    q -- queue of page numbers; each worker takes numbers from it until
         none are left.
    '''
    def __init__(self,q):
        Thread.__init__(self)
        self.q = q

    def run(self):
        '''
        Take page numbers off the queue and crawl each page until the
        queue is empty, then return.
        '''
        # The exception class lives in the queue module, which is named
        # ``Queue`` on Python 2 and ``queue`` on Python 3.
        try:
            from Queue import Empty
        except ImportError:
            from queue import Empty

        while True:
            # Non-blocking get: checking empty() and then calling a blocking
            # get() lets another worker take the last item in between, which
            # would leave this thread waiting forever.
            try:
                page = self.q.get_nowait()
            except Empty:
                break
            spiderIndex(baseUrl + str(page))
            
def main():
    '''
    Crawl every listing page and write the collected links to an HTML file.

    Queues page numbers 1 .. pageNum-1, crawls the front page directly,
    runs threadNum WorkThread workers over the queue, waits for all of them
    and then writes the contents of urlList to the file named by outPut.
    Progress and completion messages go to stdout. Returns None.
    '''
    q = Queue(maxsize=0)
    for page in xrange(1, pageNum, 1):
        q.put(page)

    print('[+] [%s] [Start]' % strftime('%X'))
    print('[*] [%s] [http://hi.baidu.com/l34rn]' % strftime('%X'))

    # NOTE(review): the front page is crawled here and page 1 is also in
    # the queue; if both serve the same listing the report will contain
    # duplicates -- confirm against the site.
    spiderIndex('http://www.freebuf.com/tools')

    for _ in xrange(threadNum):
        threadList.append(WorkThread(q))

    for worker in threadList:
        worker.start()

    # Wait for every worker so urlList is complete before it is written.
    for worker in threadList:
        worker.join()

    # Append mode: running the script again adds another report to the same
    # file rather than replacing it. ``with`` closes the file even if a
    # write fails.
    with open(outPut, 'ab') as f:
        f.write('<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n')
        f.write('<title>Freebuf Tools List</title>\n')
        f.write('<center><h1><b>Freebuf Tools List</b></h1>\n'+'Time:'+str(strftime("%Y-%b-%d %X"))+'  Count:'+str(len(urlList))+'</center><hr/>\n<h5>\n')
        for line in urlList:
            f.write(line+'</br>\n')
        f.write('</h5><hr/><center><a href="http://hi.baidu.com/l34rn">Powered By L34Rn</a></center>')

    print('\n[+] [%s] [End] [All Done!]' % strftime('%X'))
    print('[+] [%s] [Save As] [%s]' % (strftime('%X'), outPut))
    
# Start the crawl only when this file is executed as a script; importing it
# as a module does not call main().
if __name__=='__main__':
    main()