Code SummarizingBot
Latest version of 7 May 2019 at 12:32
from urllib.request import urlopen
import requests
from urllib.parse import quote
from bs4 import BeautifulSoup
from collections import OrderedDict
from geotext import GeoText
import re
# Login information
user='MasterBot'
passw='dhbot2019'
baseurl = 'http://wikipast.epfl.ch/wikipast/'
summary = 'Wikipastbot update'
# Login request
payload={'action':'query','format':'json','utf8':'','meta':'tokens','type':'login'}
r1=requests.post(baseurl + 'api.php', data=payload)
# Login confirm
login_token=r1.json()['query']['tokens']['logintoken']
payload={'action':'login','format':'json','utf8':'','lgname':user,'lgpassword':passw,'lgtoken':login_token}
r2=requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)
# Get edit token
params3='?format=json&action=query&meta=tokens&continue='
r3=requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
edit_token=r3.json()['query']['tokens']['csrftoken']
edit_cookie=r2.cookies.copy()
edit_cookie.update(r3.cookies)
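# Note: the CSRF token and merged cookies above are what the MediaWiki API expects for
# write requests (action=edit); in this excerpt they are only obtained, not yet used.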
# Regexps to determine the type of datum we are working with
content_reexp = re.compile(r"^\s?(\d{4}\.?\d{0,2}\.?\d{0,2})(\s?\/\s?)?(([\w\s\-,]+))?\.?\s?(.+)$") # Pattern for a datum line
reference_reexp = re.compile(r"\s*\[\d+\]\s*") # Pattern for a reference marker
date_reexp = re.compile(r"^\d{4}\.?\d{0,2}\.?\d{0,2}") # Pattern for a date
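# Illustration (hypothetical input, not from the original page): a Biographie line such as
#   "1921 / Zurich. Albert Einstein reçoit le prix Nobel."
# is matched by content_reexp into the groups
#   ('1921', ' / ', 'Zurich', 'Zurich', 'Albert Einstein reçoit le prix Nobel.')
# i.e. date, separator, city (captured twice by the nested group) and the event text.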
# Function to get the list of personages
def get_all_personages():
    response = urlopen("http://wikipast.epfl.ch/wikipast/index.php/Biographies")
    page_source = response.read()
    soup = BeautifulSoup(page_source, 'html.parser')
    result = []
    for primitive in soup.findAll("body"):
        for tableTag in primitive.findAll("table"):
            for trTag in tableTag.findAll("tr"):
                for tdTag in trTag.findAll("td"):
                    for aTag in tdTag.findAll("a"):
                        if aTag.string is not None:
                            result.append(aTag.string)
    for idx, _ in enumerate(result): # idx is the index of each name in the result list (e.g. 1: Andre Breton)
        result[idx] = str(result[idx].replace(' ', '_')) # Reformat the name into the form used in Wikipast page URLs
    return result
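# Illustration (hypothetical values): get_all_personages() returns page names such as
# ['Albert_Einstein', 'Andre_Breton', ...], with spaces replaced by underscores so they
# can be appended directly to a Wikipast URL.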
# Function to validate one line of personage data
def is_valid_personage_line(datum):
    # Set initial values
    date = datum[0].strip()
    city = None
    info = None
    # Determine the type of datum from its length
    if len(datum) == 3: # We have date, city and info
        city = datum[1].strip()
        info = datum[2].strip()
    elif len(datum) == 2: # We have date and info
        info = datum[1].strip()
    else:
        return False
    # Validate the date and the city
    if not date_reexp.match(date): # The date does not match the pattern
        return False
    if city:
        places = GeoText(city)
        if len(places.cities) == 0:
            if city.count(' ') > 2 or '1' in city: # Reject city fields that look like a date or an implausible city name
                return False
    return True
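# Illustration (hypothetical values):
#   is_valid_personage_line(['1921', 'Zurich', 'Albert Einstein reçoit le prix Nobel'])  -> True
#   is_valid_personage_line(['vers 1921', 'quelque part'])                               -> False (the date fails the pattern)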
# Function to generate the summary text of a Biographie
def generate_text_for_personage(name, data):
    result = ''
    last_city = None
    for datum in data: # Iterate over every datum in the list of data
        date = datum[0]
        # Remove the trailing dot from the date
        if date[-1] == '.': date = date[:-1]
        city = datum[1].strip() if len(datum) == 3 else None
        info = datum[-1]
        # Skip entries that are too short to be informative
        if len(info) < 5: continue
        # Remove the trailing dot from the info part (sentence)
        if info[-1] == '.': info = info[:-1]
        # Add a prefix to the date to make the text more readable
        date_prefix = 'En ' if len(date) == 4 else 'Le '
        # Omit the city when it repeats over two consecutive datums; otherwise append it with a prefix
        need_city = city != last_city and city != '-'
        city_postfix = ((' a ' + city) if city and need_city else '') + '. '
        result += date_prefix + date + ' ' + info + city_postfix # Final, more readable sentence
        last_city = city
    return result
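# Illustration (hypothetical datum): with data = [['1921', 'Zurich', 'Albert Einstein reçoit le prix Nobel']]
# the function returns "En 1921 Albert Einstein reçoit le prix Nobel a Zurich. ".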
def main(*args):
    pages = []
    if len(args) == 0:
        pages = get_all_personages()
    else:
        pages = args
    data = {} # Dictionary mapping each personage to the data found
    correct_data_ratio = {} # Dictionary keeping, for each personage, the ratio of lines kept (i.e. how much information is lost)
    # Initial values
    total_number_of_content_lines = 0
    total_number_of_correct_lines = 0
    # Collect all data from the Biographie pages
    for personage in pages:
        site = ("http://wikipast.epfl.ch/wikipast/index.php/" + quote(personage))
        response = urlopen(site)
        page_source = response.read()
        soup = BeautifulSoup(page_source, 'html.parser')
        content_div = soup.find(id="mw-content-text") # Main content block of the Biographie of each personage
        content_lines = content_div.findAll("li") # Lines of data
        data[personage] = [] # Relate the data to the personage
        total_number_of_content_lines += len(content_lines) # Count the total number of lines found
        for content_line in content_lines: # Process each line extracted for this personage
            content_text = content_line.get_text() # Extract the text of the datum
            content_match = content_reexp.match(content_text) # Match the extracted text against the datum pattern
            if content_match: # If the match succeeds
                findings = content_reexp.findall(content_text)
                findings_as_array = [x for xs in findings for x in xs] # Flatten the matched groups into a single list
                findings_no_duplicates = list(OrderedDict.fromkeys(findings_as_array)) # Drop duplicated groups (the nested city group is captured twice)
                findings_no_duplicates.pop(1) # Remove the separator group
                # Check whether the last element is a reference marker
                if reference_reexp.match(findings_no_duplicates[-1]): findings_no_duplicates.pop(-1)
                if is_valid_personage_line(findings_no_duplicates): data[personage].append(findings_no_duplicates)
        if len(content_lines) != 0: correct_data_ratio[personage] = len(data[personage]) / len(content_lines) # Ratio of correct data for this personage
        else: correct_data_ratio[personage] = 1 # Zero lines were read in the Biographie of this personage
        total_number_of_correct_lines += len(data[personage]) # Count the total number of lines usable for summarizing
        print(generate_text_for_personage(personage, data[personage])) # Preview the text that will be used for summarizing
    correct_data_ratio_overall = total_number_of_correct_lines / total_number_of_content_lines # Overall fraction of lines kept after all iterations
    print(correct_data_ratio_overall)
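# The published page ends here and does not show how main() is invoked or how the generated
# summaries would be written back to the wiki. The lines below are a hypothetical usage
# sketch, not part of the original bot: write_summary() shows how the standard MediaWiki
# 'edit' action could reuse edit_token, edit_cookie and summary obtained above (it is
# illustrative and not called by main()), and the guard runs the bot from the command line.
def write_summary(title, text):
    # Hypothetical helper: post 'text' to the wiki page 'title' via the MediaWiki edit API
    payload = {'action': 'edit', 'title': title, 'text': text,
               'summary': summary, 'token': edit_token, 'format': 'json'}
    return requests.post(baseurl + 'api.php', data=payload, cookies=edit_cookie)

if __name__ == '__main__':
    import sys
    main(*sys.argv[1:]) # e.g. pass Albert_Einstein as an argument, or no argument to process all biographies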