07 октября, 2013

Граббер университетов на Python.

На античате человек обратился с просьбой написать граббер вузов с сайта edu-inform.


Да, процесс занял несколько минут. Писал с использованием grablib от lorien. Использовал паука; вот весь код:



#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = 'Artur Fis'

from grab.spider import Spider,Task
from lxml.html import tostring,fromstring
from lxml.html.clean import clean_html
from sqLite import NewSqlEngine

class UniversityGrab(Spider):
    """Spider that stores university contact details from edu-inform.ru.

    Every listing page in ``initial_urls`` is scanned for links with the
    ``op_text`` class; each link is queued as a ``grab_data`` task.  A
    ``grab_data`` page is parsed into one row with the keys ``telephone``,
    ``address``, ``site`` and ``email``, which is appended to ``results`` and
    written to the ``university`` table through ``sql``.
    """

    # Listing pages 1..14 of the catalogue.
    initial_urls = ['http://www.edu-inform.ru/university/?PAGEN_1=%s' % i
                    for i in range(1, 15)]
    # Class-level default only; prepare() rebinds a fresh list on the instance
    # so that separate spiders do not append to one shared list.
    results = []
    # NOTE: created while the class body executes and shared by all instances;
    # the caller is responsible for closing it.
    sql = NewSqlEngine('universities.db')
    page_number = 0
    university_count = 0
    # Total the progress percentage is measured against (was a literal 236).
    expected_university_count = 236

    def prepare(self):
        """Initialise per-run state and create the ``university`` table."""
        self.page_count = len(self.initial_urls)
        self.results = []
        self.sql.create_table('university',{'telephone':'TEXT','address':'TEXT','site':'TEXT','email':'TEXT'})

    def print_statistic(self):
        """Print the progress percentage and the number of stored rows."""
        total = self.expected_university_count
        # Floor division keeps the integer percentage of the original code;
        # a non-positive total is reported as 0 % instead of raising.
        process = (self.university_count * 100) // total if total > 0 else 0
        # Same output as the former multi-argument print statement.
        print('Process =  %d %%, count =  %d' % (process, self.university_count))

    def task_initial(self, grab, task):
        """Queue a ``grab_data`` task for every university link on a listing page."""
        for link in grab.xpath_list('//a[@class="op_text"]'):
            href = link.get('href')
            if not href:
                # An anchor without href has nothing to resolve.
                continue
            yield Task('grab_data', url=grab.make_url_absolute(href))

        self.page_number += 1
        self.print_statistic()

    def task_grab_data(self, grab, task):
        """Parse one university page and store the extracted row."""
        card = grab.xpath('//div[@class="card_content"]')
        push_data = {'telephone':'','address':'','site':'','email':''}
        # A relative query keeps the search inside the card, so the element
        # does not have to be serialised and re-parsed first.
        for paragraph in card.xpath('.//p'):
            text_data = clean_html(paragraph).text_content()
            self._store_paragraph(push_data, text_data)
        self.results.append(push_data)
        self.sql.put('university', push_data)
        self.university_count += 1

    @staticmethod
    def _store_paragraph(push_data, text_data):
        """File the text of one paragraph under the matching *push_data* key.

        Paragraphs containing ``Тел.`` or ``Адрес:`` are stored whole.  Any
        other paragraph is split on whitespace: a single token is stored as
        ``email`` when it contains ``@`` and as ``site`` otherwise; with
        several tokens only the first two are examined, and the pair is
        stored only if one of them contains ``@``.
        """
        if not text_data.strip():
            # A blank paragraph must not overwrite a value found earlier.
            return

        # Stored values are byte strings: non-ASCII text is encoded as
        # windows-1251, exactly as before.  Anything that is not already a
        # byte string is encoded, so ``text`` is always bound.
        if isinstance(text_data, str):
            text = str(text_data)
        else:
            text = text_data.encode('windows-1251')

        if u'Тел.' in text_data:
            push_data['telephone'] = text
            return
        if u'Адрес:' in text_data:
            push_data['address'] = text
            return

        # split() without arguments ignores leading/trailing whitespace and
        # runs of spaces, so no empty token can end up in a column.
        tokens = text.split()
        if len(tokens) == 1:
            key = 'email' if '@' in tokens[0] else 'site'
            push_data[key] = tokens[0]
        elif '@' in tokens[0]:
            push_data['email'] = tokens[0]
            push_data['site'] = tokens[1]
        elif '@' in tokens[1]:
            push_data['email'] = tokens[1]
            push_data['site'] = tokens[0]

# Run the spider with three worker threads, then report the final progress.
Grabber = UniversityGrab(thread_number=3)
try:
    Grabber.run()
finally:
    # Close the database even when the crawl aborts with an exception.
    Grabber.sql.close()
Grabber.print_statistic()

Комментариев нет:

Отправить комментарий