SourceBot
Description
The bot detects when a source is missing from a line of a biography and flags it. It updates a FactChecking page that lists the unsourced entries, associating each one with the particular missing reference. It rescans the page regularly, and once the source has been added it restores the entry to the biography.
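A minimal sketch of the detection step, reusing the regular expression from the Code section below: a biography line only counts as sourced if it carries a letemps.ch link (the sample line is hypothetical).

import re

# Same pattern as in the Code section below: a line is considered
# sourced only if it contains an [http://...letemps...] reference.
sourced = re.compile('\*.*\[https?:\/\/w?w?w?\.?letemps[^\]]*\].*')

# hypothetical biography line with no source
line = "* [[1978]] / [[Lausanne]]. Election de Daniel Brélaz à l'Assemblée fédérale."
if not sourced.match(line):
    print('unsourced entry:', line)  # this is what ends up reported on FactChecking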
Examples
- 1975 / Lausanne. Diplôme en mathématiques à l'EPFL. [1]
- 1978 / Lausanne. Election de Daniel Brélaz à l'Assemblée fédérale.
Our bot retrieves Daniel Brélaz's page and returns the dates of every entry that is unsourced or sourced from anywhere other than letemps.ch. It then writes to FactChecking: "The wrong entries in this page are the sources with the following dates: [ 1975 , 1978 ]".
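Assuming the findBadPage function from the Code section below is in scope, the example plays out like this; the page text is a hypothetical reconstruction of the two entries above, the first citing a non-letemps source.

# Hypothetical wiki text for the two entries above: the first cites a
# source other than letemps.ch, the second has no source at all.
page = ("* [[1975]] / [[Lausanne]]. Diplôme en mathématiques à l'EPFL. "
        "[http://example.org/diploma]\n"
        "* [[1978]] / [[Lausanne]]. Election de Daniel Brélaz à l'Assemblée fédérale.")

ok, wrongDatesSources = findBadPage(page)  # also prints each unsourced line
print(ok)                 # False
print(wrongDatesSources)  # ['1975', '1978']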
Performance
Our bot finds every entry in the Datafication Biographiques pages that is unsourced or badly sourced, then updates the FactChecking page with the missing date or dates. If a date is not written in one of the forms *[[date]]/ or *[[date]]: or the standard form * [[date]] /, the reporting of badly sourced entries on FactChecking is less reliable.
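A quick check of which date layouts the three patterns in the code actually recognise; the last sample is a hypothetical variant that falls through and gets reported as a "false source" instead of a date.

import re

# The three date-isolating patterns from findBadPage below.
dates_isolator_expr = ['\* \[\[(.*)\]\] \/', '\*\[\[(.*)\]\]\:', '\*\[\[(.*)\]\]\/']

samples = [
    '* [[1975]] / [[Lausanne]]. ...',  # standard form: recognised
    '*[[1975]]: [[Lausanne]]. ...',    # recognised
    '*[[1975]]/ [[Lausanne]]. ...',    # recognised
    '* [[1975]] : [[Lausanne]]. ...',  # hypothetical variant: not recognised
]
for s in samples:
    print(any(re.match(e, s) for e in dates_isolator_expr), s)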
Code
# coding: utf-8
import urllib
import requests
from bs4 import BeautifulSoup
import re

user='Vlaedr'
passw='Alextall007'
baseurl='http://wikipast.epfl.ch/wikipast/'
summary='Wikipastbot update'
#names=['Daniel Brélaz']

# Login request
payload={'action':'query','format':'json','utf8':'','meta':'tokens','type':'login'}
r1=requests.post(baseurl + 'api.php', data=payload)

# login confirm
login_token=r1.json()['query']['tokens']['logintoken']
payload={'action':'login','format':'json','utf8':'','lgname':user,'lgpassword':passw,'lgtoken':login_token}
r2=requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)

# get edit token
params3='?format=json&action=query&meta=tokens&continue='
r3=requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
edit_token=r3.json()['query']['tokens']['csrftoken']

edit_cookie=r2.cookies.copy()
edit_cookie.update(r3.cookies)

# download the wiki text of the page listing all biographies
fullbios = ''
name = 'Biographies'
result=requests.post(baseurl+'api.php?action=query&titles='+name+'&export&exportnowrap')
soup=BeautifulSoup(result.text, "lxml")
code=''
for primitive in soup.findAll("text"):
    code+=primitive.string
fullbios = code

# collect the name of every biography listed on that page
allnames = []
pattern = '\* ?\[\[([^\]]*)\]\].*'
p = re.compile(pattern)
bioNames = fullbios.split('\n')
for c in bioNames:
    tk = c.split('\n')
    for t in tk:
        if t:
            match = p.match(t)
            if match:
                allnames.append(match.group(1))

# download the wiki text of each biography
fullCode = []
for name in allnames:
    result=requests.post(baseurl+'api.php?action=query&titles='+name+'&export&exportnowrap')
    soup=BeautifulSoup(result.text, "lxml")
    #soup=BeautifulSoup(result.text)
    code=''
    for primitive in soup.findAll("text"):
        code += primitive.string
    fullCode.append((code, name))

# the argument is the plain text of the page
# this function returns a tuple:
#   first element is a boolean: true if all entries are sourced,
#                               false if sources are missing
#   second element is a list of all dates where the source is wrong
def findBadPage(pageText):
    # get all the lines
    tokens = []
    tk = pageText.split('\n')
    for t in tk:
        if t:
            if t[0] == '*':
                tokens.append(t)
    # check if the line is sourced
    r = '\*.*\[https?:\/\/w?w?w?\.?letemps[^\]]*\].*'
    p = re.compile(r)
    dates_isolator_expr = ['\* \[\[(.*)\]\] \/', '\*\[\[(.*)\]\]\:', '\*\[\[(.*)\]\]\/']
    index = 0
    allSourced = True
    wrongDatesSources = []
    for t in tokens:
        match = p.match(t)
        if not match:
            print(t)
            allSourced = False
            count = 0
            didmatch = False
            # check if the date is in any of the three forms
            for i in dates_isolator_expr:
                count += 1
                d = re.compile(i)
                match = d.match(t)
                if match:
                    didmatch = True
                    wrongDatesSources.append(match.group(1))
                if count == 3 and not didmatch:
                    wrongDatesSources.append('false source at line: ' + str(index))
            # compiled and matched, but the result is never used
            otherSource = '\*.*\[(https?:\/\/.*)\].*'
            pOth = re.compile(otherSource)
            match = pOth.match(t)
        index += 1
    return (allSourced, wrongDatesSources)

# build the FactChecking report
content = '\n'
content += 'Cette page liste toutes les biographies ayant des entrées non sourcées.'
content += '\n==Bad page==\n'
badpage = []
wrongDatesSources = []
ok = True
for (c, name) in fullCode:
    (ok, wrongDatesSources) = findBadPage(c)
    if not ok:
        badpage.append(name)
        content += name
        content += '\n The wrong entries in this page are the sources with the following dates: '
        content += str(wrongDatesSources)
        content += '\n\n'

# send to page FactChecking
'''content = '\n'
content += 'Cette page liste toutes les biographies ayant des entrées non sourcées.'
content += '\n==Bad page=='
content += '\n[[' + badpage[0] + ']]'
'''
payload = {'action':'edit','assert':'user','format':'json','utf8':'','text':content,'summary':summary,'title':'FactChecking','token':edit_token}
r4=requests.post(baseurl+'api.php',data=payload,cookies=edit_cookie)
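Not part of the original script: a small sanity check one could append after the final request, relying on the standard MediaWiki API response shape (an 'edit' key on success, an 'error' key on failure).

# Hypothetical addition: verify that the FactChecking edit went through.
resp = r4.json()
if 'error' in resp:
    print('edit failed:', resp['error'].get('info'))
else:
    print('FactChecking updated:', resp.get('edit', {}).get('result'))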