#!/usr/bin/env python
########################################################################
#
# Project: Metalinks
# URL: http://www.nabber.org/projects/
# E-mail: webmaster@nabber.org
#
# Copyright: (C) 2010, Neil McNab
# License: GNU General Public License Version 2
# (http://www.gnu.org/copyleft/gpl.html)
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# Filename: $URL: https://appupdater.svn.sourceforge.net/svnroot/appupdater/server_tools/crawler.py $
# Last Updated: $Date: 2010-01-16 18:28:50 -0800 (Sat, 16 Jan 2010) $
# Author(s): Neil McNab
#
# Description:
# Crawls webpages looking for metalinks.
#
# TODO check for metalink header from mirrorbrain websites.
########################################################################
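# Illustrative usage (the URLs below are placeholders, not from this project):
#   python crawler.py -t -d http://example.com/downloads/
#
# With no URL arguments, start URLs are read from config.ini (see
# parse_config() below), one section per start URL, for example:
#   [http://example.com/downloads/]
# An optional "crawl" option in a section is passed to App() as its nocrawl
# flag; any non-empty value (options are read as strings) stops crawling
# beyond that start page.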
import urllib
import os
import ConfigParser
import optparse
import urlparse
import hashlib
import HTMLParser
import time
import threading
import robotparser

#CACHEDIR = "htmlcache"
METADIR = "/var/www/nabber/projects/metalink/crawler/index/"
CACHETIME = 3600*24*6
PAUSETIME = 1
SKIPEXT = ["magnet", "rpm", "jpg", "mpg", "iso", "gif", "png", "drpm", "img",
           "zip", "dmg", "md5", "sha1", "sha256", "exe", "txt", "gz", "bz2",
           "cz", "sh", "pdf"]

class LinkParser(HTMLParser.HTMLParser):
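    """HTMLParser subclass that collects absolute URLs from <a href> tags."""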
def __init__(self, url):
HTMLParser.HTMLParser.__init__(self)
self.url = url
self.links = []
def handle_starttag(self, tag, attrs):
if tag == "a":
for attr in attrs:
if attr[0] == "href":
self.links.append(urlparse.urljoin(self.url, attr[1]))

def parse_config(configfile="config.ini"):
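    """Read an INI file and return it as a dict of {section: {option: value}}."""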
configdict = {}
config = ConfigParser.RawConfigParser()
config.read(configfile)
for sect in config.sections():
configdict[sect] = {}
for options in config.items(sect):
configdict[sect][options[0]] = options[1]
return configdict

class App(threading.Thread):
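    """Crawler for a single start URL; may be run as its own thread."""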
def __init__(self, url, debug = False, obey_robots=True, nocrawl=False):
threading.Thread.__init__(self)
self.queue = []
self.indexed = []
self.metalink_count = 0
self.rp = None
self.url = url
self.debug = debug
self.obey_robots = obey_robots
self.nocrawl = nocrawl
self.add(url)
def download_metalink(self, url):
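        """Download a metalink file into METADIR unless a fresh cached copy exists."""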
self.metalink_count += 1
filename = os.path.join(METADIR, os.path.basename(url))
if not os.path.exists(METADIR):
# if METADIR is a symlink we need to catch the error
try:
os.makedirs(METADIR)
            except OSError:
                pass
mtime = 0
if os.access(filename, os.F_OK):
            mtime = os.stat(filename).st_mtime
if mtime + CACHETIME > time.time():
if self.debug:
print filename, "already cached."
return False
        try:
            urllib.urlretrieve(url, filename)
        except IOError:
            print "IOError for", filename
            return False
if self.debug:
print filename, "updated."
time.sleep(PAUSETIME)
return True
def process_html(self, text, url=''):
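        """Parse HTML for links: download metalinks, queue other pages for crawling."""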
parser = LinkParser(url)
try:
parser.feed(text)
except HTMLParser.HTMLParseError:
pass
for link in parser.links:
            # skip "?view=" query links (e.g. repository browser views)
            if os.path.basename(link).startswith("?view="):
                pass
elif link.endswith(".metalink") or link.endswith(".meta4"):
self.download_metalink(link)
elif not self.nocrawl:
self.add(link)
def crawl(self, webpage):
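        """Fetch a single page, honoring robots.txt, and process its links."""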
        # hash of the page URL (currently unused)
        m = hashlib.md5()
        m.update(webpage)
if not self.check_robots(webpage):
print "Crawling prohibited by robots.txt:", webpage
return
try:
handle = urllib.urlopen(webpage)
        except Exception:
print "Webpage Download Error:", webpage
return
try:
text = handle.read()
except MemoryError:
print "Memory Error:", webpage
return
self.indexed.append(handle.geturl())
handle.close()
time.sleep(PAUSETIME)
self.process_html(text, handle.geturl())
return
def add(self, page):
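        """Queue a page under the start URL unless already seen or a skipped file type."""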
if not page.startswith(os.path.dirname(self.url)):
return
        loc = page.rfind("#")
        if loc != -1:
            page = page[:loc]
ext = page[page.rfind(".")+1:].lower()
if ext in SKIPEXT:
return
if page in self.queue:
return
if page in self.indexed:
return
self.queue.append(page)
def init_robots(self):
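        """Fetch and parse robots.txt for the start URL's host."""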
if not self.obey_robots:
return
self.rp = robotparser.RobotFileParser()
parts = urlparse.urlparse(self.url)
self.rp.set_url(parts.scheme + '://' + parts.netloc + "/robots.txt")
try:
self.rp.read()
except IOError:
print "Connection failed: ", self.url
def check_robots(self, url):
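        """Return True if url may be fetched (allowed by robots.txt or robots ignored)."""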
        if not self.obey_robots or self.rp is None:
return True
return self.rp.can_fetch("*", url)
def run(self):
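        """Crawl pages from the queue until it is empty, recording the total runtime."""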
self.init_robots()
starttime = time.time()
while len(self.queue) > 0:
page = self.queue.pop()
if self.debug:
print "Queue len:", len(self.queue), "Metalink Count:", self.metalink_count, "Run time:", int(time.time() - starttime), "seconds"
print "Crawling", page
self.indexed.append(page)
self.crawl(page)
self.runtime = time.time() - starttime

def get_version():
return "Version 1.0"

def run():
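    """Command-line entry point: parse options and crawl each configured site."""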
parser = optparse.OptionParser(usage = "usage: %prog [options]")
parser.add_option("--version", dest="printversion", action="store_true", help="Display the version information for this program")
parser.add_option("-t", dest="threaded", action="store_true", help="Run each website in its own thread")
parser.add_option("-d", dest="debug", action="store_true", help="Show full printout information")
parser.add_option("-r", dest="robots", action="store_false", help="Ignore robots.txt")
parser.add_option("-c", dest="nocrawl", action="store_true", help="Do not crawl (single page)")
parser.set_defaults(robots=True, nocrawl=False)
(options, args) = parser.parse_args()
    if options.printversion:
print get_version()
return
    config = {}
    if len(args) > 0:
        items = args
        for key in args:
            config[key] = {"crawl": options.nocrawl}
    else:
        config = parse_config()
        items = config.keys()
apps = []
if options.threaded:
for key in items:
            nocrawl = config[key].get("crawl", False)
app = App(key, options.debug, options.robots, nocrawl)
apps.append(app)
app.start()
time.sleep(PAUSETIME)
else:
for key in items:
            nocrawl = config[key].get("crawl", False)
app = App(key, options.debug, options.robots, nocrawl)
apps.append(app)
app.run()
while (threading.activeCount() > 1):
time.sleep(1)
for app in apps:
        timepermetalink = 0
        try:
            timepermetalink = app.runtime / app.metalink_count
        except ZeroDivisionError:
            pass
print app.url
print "Metalink Count:", app.metalink_count, "Runtime in minutes:", int(app.runtime/60), "Time/Metalink:", timepermetalink
return

if __name__ == "__main__":
run()