Subversion Repositories SE.SVN

Rev

Rev 39 | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 39 Rev 45
Line 24... Line 24...
24
 
24
 
25
class Crawler(threading.Thread):
25
class Crawler(threading.Thread):
26
 
26
 
27
    allowed_sites = [r'''.*''']
27
    allowed_sites = [r'''.*''']
28
    forbidden_sites = []
28
    forbidden_sites = []
-
 
29
    user_agent = 'Mozilla Firefox'
29
 
30
 
30
    db_pool =  None
31
    db_pool =  None
31
    db_table = 'crawl'
32
    db_table = 'crawl'
32
    db_sequence = 'crawl_position'
33
    db_sequence = 'crawl_position'
33
    db_domains = 'crawl_domains'
34
    db_domains = 'crawl_domains'
Line 891... Line 892...
891
            p = urllib.parse.urlparse(r['url'])
892
            p = urllib.parse.urlparse(r['url'])
892
            conn = httplib2.Http()
893
            conn = httplib2.Http()
893
#           url = urllib.parse.urlunparse(p)
894
#           url = urllib.parse.urlunparse(p)
894
#           conn.follow_redirects = False
895
#           conn.follow_redirects = False
895
            print ("FETCH:",r['url'])
896
            print ("FETCH:",r['url'])
896
            resp,body = conn.request(r['url'])
897
            resp,body = conn.request(
-
 
898
                r['url'],
-
 
899
                method="GET",
-
 
900
                headers={'user-agent': self.user_agent}
-
 
901
            )
897
            self.handle_response(r,resp,body)
902
            self.handle_response(r,resp,body)
898
            r['visits']=r['visits']+1
903
            r['visits']=r['visits']+1
899
            r['content']=body
904
            r['content']=body
900
        except:
905
        except:
901
            traceback.print_exc()
906
            traceback.print_exc()