Code SummarizingBot
Latest version of 7 May 2019 at 12:32
from urllib.request import urlopen
import requests
from urllib.parse import quote
from bs4 import BeautifulSoup
from collections import OrderedDict
from geotext import GeoText
import re
# Login information
user='MasterBot'
passw='dhbot2019'
baseurl = 'http://wikipast.epfl.ch/wikipast/'
summary = 'Wikipastbot update'
# Login request
payload={'action':'query','format':'json','utf8':'','meta':'tokens','type':'login'}
r1=requests.post(baseurl + 'api.php', data=payload)
# Login confirm
login_token=r1.json()['query']['tokens']['logintoken']
payload={'action':'login','format':'json','utf8':'','lgname':user,'lgpassword':passw,'lgtoken':login_token}
r2=requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)
# Get edit token
params3='?format=json&action=query&meta=tokens&continue='
r3=requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
edit_token=r3.json()['query']['tokens']['csrftoken']
edit_cookie=r2.cookies.copy()
edit_cookie.update(r3.cookies)
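# Note: the CSRF token and merged cookies above are what the MediaWiki API expects for
# write requests (action=edit); in this excerpt they are only obtained, not yet used.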
# Regexps to determine the type of datum we are working with
content_reexp = re.compile(r"^\s?(\d{4}\.?\d{0,2}\.?\d{0,2})(\s?\/\s?)?(([\w\s\-,]+))?\.?\s?(.+)$") # Pattern for a datum line
reference_reexp = re.compile(r"\s*\[\d+\]\s*") # Pattern for a reference marker
date_reexp = re.compile(r"^\d{4}\.?\d{0,2}\.?\d{0,2}") # Pattern for a date
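# Illustration (hypothetical input, not from the original page): a Biographie line such as
#   "1921 / Zurich. Albert Einstein reçoit le prix Nobel."
# is matched by content_reexp into the groups
#   ('1921', ' / ', 'Zurich', 'Zurich', 'Albert Einstein reçoit le prix Nobel.')
# i.e. date, separator, city (captured twice by the nested group) and the event text.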
# Function to get the list of personages
def get_all_personages():
    response = urlopen("http://wikipast.epfl.ch/wikipast/index.php/Biographies")
    page_source = response.read()
    soup = BeautifulSoup(page_source, 'html.parser')
    result = []
    for primitive in soup.findAll("body"):
        for tableTag in primitive.findAll("table"):
            for trTag in tableTag.findAll("tr"):
                for tdTag in trTag.findAll("td"):
                    for aTag in tdTag.findAll("a"):
                        if aTag.string is not None:
                            result.append(aTag.string)
    for idx, _ in enumerate(result): # idx is the index of each name in the result list (e.g. 1: Andre Breton)
        result[idx] = str(result[idx].replace(' ', '_')) # Reformat the name into the form used in Wikipast page URLs
    return result
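# Illustration (hypothetical values): get_all_personages() returns page names such as
# ['Albert_Einstein', 'Andre_Breton', ...], with spaces replaced by underscores so they
# can be appended directly to a Wikipast URL.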
# Function to validate one line of personage data
def is_valid_personage_line(datum):
    # Set initial values
    date = datum[0].strip()
    city = None
    info = None
    # Determine the type of datum from its length
    if len(datum) == 3: # We have date, city and info
        city = datum[1].strip()
        info = datum[2].strip()
    elif len(datum) == 2: # We have date and info
        info = datum[1].strip()
    else:
        return False
    # Validate the date and the city
    if not date_reexp.match(date): # The date does not match the pattern
        return False
    if city:
        places = GeoText(city)
        if len(places.cities) == 0:
            if city.count(' ') > 2 or '1' in city: # Reject city fields that look like a date or an implausible city name
                return False
    return True
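# Illustration (hypothetical values):
#   is_valid_personage_line(['1921', 'Zurich', 'Albert Einstein reçoit le prix Nobel'])  -> True
#   is_valid_personage_line(['vers 1921', 'quelque part'])                               -> False (the date fails the pattern)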
# Function to generate the summary text of a Biographie
def generate_text_for_personage(name, data):
    result = ''
    last_city = None
    for datum in data: # Iterate over every datum in the list of data
        date = datum[0]
        # Remove the trailing dot from the date
        if date[-1] == '.': date = date[:-1]
        city = datum[1].strip() if len(datum) == 3 else None
        info = datum[-1]
        # Skip entries that are too short to be informative
        if len(info) < 5: continue
        # Remove the trailing dot from the info part (sentence)
        if info[-1] == '.': info = info[:-1]
        # Add a prefix to the date to make the text more readable
        date_prefix = 'En ' if len(date) == 4 else 'Le '
        # Omit the city when it repeats over two consecutive datums; otherwise append it with a prefix
        need_city = city != last_city and city != '-'
        city_postfix = ((' a ' + city) if city and need_city else '') + '. '
        result += date_prefix + date + ' ' + info + city_postfix # Final, more readable sentence
        last_city = city
    return result
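# Illustration (hypothetical datum): with data = [['1921', 'Zurich', 'Albert Einstein reçoit le prix Nobel']]
# the function returns "En 1921 Albert Einstein reçoit le prix Nobel a Zurich. ".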
def main(*args):
    pages = []
    if len(args) == 0:
        pages = get_all_personages()
    else:
        pages = args
    data = {} # Dictionary mapping each personage to the data found
    correct_data_ratio = {} # Dictionary keeping, for each personage, the ratio of lines kept (i.e. how much information is lost)
    # Initial values
    total_number_of_content_lines = 0
    total_number_of_correct_lines = 0
    # Collect all data from the Biographie pages
    for personage in pages:
        site = ("http://wikipast.epfl.ch/wikipast/index.php/" + quote(personage))
        response = urlopen(site)
        page_source = response.read()
        soup = BeautifulSoup(page_source, 'html.parser')
        content_div = soup.find(id="mw-content-text") # Main content block of the Biographie of each personage
        content_lines = content_div.findAll("li") # Lines of data
        data[personage] = [] # Relate the data to the personage
        total_number_of_content_lines += len(content_lines) # Count the total number of lines found
        for content_line in content_lines: # Process each line extracted for this personage
            content_text = content_line.get_text() # Extract the text of the datum
            content_match = content_reexp.match(content_text) # Match the extracted text against the datum pattern
            if content_match: # If the match succeeds
                findings = content_reexp.findall(content_text)
                findings_as_array = [x for xs in findings for x in xs] # Flatten the matched groups into a single list
                findings_no_duplicates = list(OrderedDict.fromkeys(findings_as_array)) # Drop duplicated groups (the nested city group is captured twice)
                findings_no_duplicates.pop(1) # Remove the separator group
                # Check whether the last element is a reference marker
                if reference_reexp.match(findings_no_duplicates[-1]): findings_no_duplicates.pop(-1)
                if is_valid_personage_line(findings_no_duplicates): data[personage].append(findings_no_duplicates)
        if len(content_lines) != 0: correct_data_ratio[personage] = len(data[personage]) / len(content_lines) # Ratio of correct data for this personage
        else: correct_data_ratio[personage] = 1 # Zero lines were read in the Biographie of this personage
        total_number_of_correct_lines += len(data[personage]) # Count the total number of lines usable for summarizing
        print(generate_text_for_personage(personage, data[personage])) # Preview the text that will be used for summarizing
    correct_data_ratio_overall = total_number_of_correct_lines / total_number_of_content_lines # Overall fraction of lines kept after all iterations
    print(correct_data_ratio_overall)
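# The published page ends here and does not show how main() is invoked or how the generated
# summaries would be written back to the wiki. The lines below are a hypothetical usage
# sketch, not part of the original bot: write_summary() shows how the standard MediaWiki
# 'edit' action could reuse edit_token, edit_cookie and summary obtained above (it is
# illustrative and not called by main()), and the guard runs the bot from the command line.
def write_summary(title, text):
    # Hypothetical helper: post 'text' to the wiki page 'title' via the MediaWiki edit API
    payload = {'action': 'edit', 'title': title, 'text': text,
               'summary': summary, 'token': edit_token, 'format': 'json'}
    return requests.post(baseurl + 'api.php', data=payload, cookies=edit_cookie)

if __name__ == '__main__':
    import sys
    main(*sys.argv[1:]) # e.g. pass Albert_Einstein as an argument, or no argument to process all biographies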