Code SummarizingBot
from urllib.request import urlopen
import requests
from urllib.parse import quote
from bs4 import BeautifulSoup
from collections import OrderedDict
from geotext import GeoText
import re

# Login information
user = 'MasterBot'
passw = 'dhbot2019'
baseurl = 'http://wikipast.epfl.ch/wikipast/'
summary = 'Wikipastbot update'

# Login request
payload = {'action': 'query', 'format': 'json', 'utf8': '', 'meta': 'tokens', 'type': 'login'}
r1 = requests.post(baseurl + 'api.php', data=payload)

# Login confirmation
login_token = r1.json()['query']['tokens']['logintoken']
payload = {'action': 'login', 'format': 'json', 'utf8': '', 'lgname': user, 'lgpassword': passw, 'lgtoken': login_token}
r2 = requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)

# Get edit token
params3 = '?format=json&action=query&meta=tokens&continue='
r3 = requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
edit_token = r3.json()['query']['tokens']['csrftoken']
edit_cookie = r2.cookies.copy()
edit_cookie.update(r3.cookies)

# Regular expressions defining the kind of datum we work with
content_reexp = re.compile(r"^\s?(\d{4}\.?\d{0,2}\.?\d{0,2})(\s?/\s?)?(([\w\s\-,]+))?\.?\s?(.+)$")  # Pattern for a datum line
reference_reexp = re.compile(r"\s*\[\d+\]\s*")  # Pattern for a reference
date_reexp = re.compile(r"^\d{4}\.?\d{0,2}\.?\d{0,2}")  # Pattern for a date

# Collect the list of personages from the Biographies page
def get_all_personages():
    response = urlopen("http://wikipast.epfl.ch/wikipast/index.php/Biographies")
    page_source = response.read()
    soup = BeautifulSoup(page_source, 'html.parser')
    result = []
    for primitive in soup.findAll("body"):
        for tableTag in primitive.findAll("table"):
            for trTag in tableTag.findAll("tr"):
                for tdTag in trTag.findAll("td"):
                    for aTag in tdTag.findAll("a"):
                        if aTag.string is not None:
                            result.append(aTag.string)
    for idx, _ in enumerate(result):
        # Reformat each name (e.g. "Andre Breton" -> "Andre_Breton") to build a valid link to its Biographie page
        result[idx] = str(result[idx].replace(' ', '_'))
    return result

# Verify that an extracted datum describes a valid personage line
def is_valid_personage_line(datum):
    # Initial values
    date = datum[0].strip()
    city = None
    info = None
    # Determine the type of datum from its length
    if len(datum) == 3:
        # We have date, city and info
        city = datum[1].strip()
        info = datum[2].strip()
    elif len(datum) == 2:
        # We have date and info
        info = datum[1].strip()
    else:
        return False
    # Validate the date
    if not date_reexp.match(date):
        # The date does not match the expected pattern
        return False
    # Validate the city
    if city:
        places = GeoText(city)
        if len(places.cities) == 0:
            if city.count(' ') > 2 or '1' in city:
                # The "city" field is probably a date or an invalid city name
                return False
    return True

# Generate the summary text of a Biographie
def generate_text_for_personage(name, data):
    result = ''
    last_city = None
    for datum in data:
        # Process every datum in the list
        date = datum[0]
        # Remove a trailing dot from the date
        if date[-1] == '.':
            date = date[:-1]
        city = datum[1].strip() if len(datum) == 3 else None
        info = datum[-1]
        # Skip entries that carry almost no information
        if len(info) < 5:
            continue
        # Remove a trailing dot from the info sentence
        if info[-1] == '.':
            info = info[:-1]
        # Prefix the date to make the text more readable ("En 1921" / "Le 1921.05.09")
        date_prefix = 'En ' if len(date) == 4 else 'Le '
        # Omit the city when it repeats over two consecutive entries or is a placeholder
        need_city = city != last_city and city != '-'
        city_postfix = ((' a ' + city) if city and need_city else '') + '. '
        result += date_prefix + date + ' ' + info + city_postfix  # Assemble a more readable sentence
        last_city = city
    return result

def main(*args):
    if len(args) == 0:
        pages = get_all_personages()
    else:
        pages = args
    data = {}  # key:value store of all extracted data
    correct_data_ratio = {}  # key:value store of the fraction of usable lines per personage
    # Initial values
    total_number_of_content_lines = 0
    total_number_of_correct_lines = 0
    # Collect all data from the Biographie pages
    for personage in pages:
        site = "http://wikipast.epfl.ch/wikipast/index.php/" + quote(personage)
        response = urlopen(site)
        page_source = response.read()
        soup = BeautifulSoup(page_source, 'html.parser')
        content_div = soup.find(id="mw-content-text")  # Main content block of the Biographie
        content_lines = content_div.findAll("li")  # One list item per datum
        data[personage] = []  # Relate the data to the personage
        total_number_of_content_lines += len(content_lines)  # Count the total number of lines found
        for content_line in content_lines:
            # Process each extracted line of the personage
            content_text = content_line.get_text()  # Extract the text of the datum
            content_match = content_reexp.match(content_text)  # Check the text against the datum pattern
            if content_match:
                # The line matches the expected pattern
                findings = content_reexp.findall(content_text)
                findings_as_array = [x for xs in findings for x in xs]  # Flatten the captured groups
                findings_no_duplicates = list(OrderedDict.fromkeys(findings_as_array))  # Remove duplicated captures
                findings_no_duplicates.pop(1)  # Drop the redundant separator element
                # Drop the last element if it is a reference
                if reference_reexp.match(findings_no_duplicates[-1]):
                    findings_no_duplicates.pop(-1)
                if is_valid_personage_line(findings_no_duplicates):
                    data[personage].append(findings_no_duplicates)
        if len(content_lines) != 0:
            correct_data_ratio[personage] = len(data[personage]) / len(content_lines)  # Fraction of usable lines for this personage
        else:
            correct_data_ratio[personage] = 1  # No lines were read from this Biographie
        total_number_of_correct_lines += len(data[personage])  # Count the lines usable for summarizing
        print(generate_text_for_personage(personage, data[personage]))  # Inspect the text that will be used as the summary
    correct_data_ratio_overall = total_number_of_correct_lines / total_number_of_content_lines  # Fraction of data kept over the whole run
    print(correct_data_ratio_overall)
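The login sequence above obtains edit_token and edit_cookie, but the listing never writes anything back to the wiki. A minimal sketch of how the generated summary could be posted through the standard MediaWiki action=edit endpoint; the helper name publish_summary and the target title 'Resume_' + name are illustrative assumptions, not part of the original bot:

def publish_summary(name, text):
    # Hypothetical helper: write the generated summary to a wiki page,
    # reusing the edit token and cookies obtained during login.
    payload = {'action': 'edit',
               'format': 'json',
               'utf8': '',
               'title': 'Resume_' + name,  # assumed target page name
               'text': text,
               'summary': summary,
               'token': edit_token}
    r4 = requests.post(baseurl + 'api.php', data=payload, cookies=edit_cookie)
    print(r4.json())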
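main() is defined but never called in this listing. A minimal way to run the bot, appended at the end of the script; the page name 'Albert_Einstein' is only an illustrative argument:

if __name__ == '__main__':
    # Summarize every biography listed on the Biographies page...
    main()
    # ...or restrict the run to specific pages, e.g.:
    # main('Albert_Einstein')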