Code SourceBot
import requests
from bs4 import BeautifulSoup
import re

user = 'MasterBot'
passw = 'dhbot2019'
baseurl = 'http://wikipast.epfl.ch/wikipast/'
summary = 'Wikipastbot update'

# Login request: fetch a login token
payload = {'action': 'query', 'format': 'json', 'utf8': '', 'meta': 'tokens', 'type': 'login'}
r1 = requests.post(baseurl + 'api.php', data=payload)
login_token = r1.json()['query']['tokens']['logintoken']

# Confirm login
payload = {'action': 'login', 'format': 'json', 'utf8': '', 'lgname': user, 'lgpassword': passw, 'lgtoken': login_token}
r2 = requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)

# Get the edit (CSRF) token
params3 = '?format=json&action=query&meta=tokens&continue='
r3 = requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
edit_token = r3.json()['query']['tokens']['csrftoken']
edit_cookie = r2.cookies.copy()
edit_cookie.update(r3.cookies)

# Fetch the wikitext of the Biographies page
name = 'Biographies'
result = requests.post(baseurl + 'api.php?action=query&titles=' + name + '&export&exportnowrap')
soup = BeautifulSoup(result.text, "lxml")
fullbios = ''
for primitive in soup.findAll("text"):
    if primitive.string:
        fullbios += primitive.string

protected_logins = ["Frederickaplan", "Maud", "Vbuntinx", "Testbot", "SparqlBot", "IB", "SourceBot",
                    "PageUpdaterBot", "Orthobot", "BioPathBot", "ChronoBOT", "Amonbaro", "AntoineL",
                    "AntoniasBanderos", "Arnau", "Arnaudpannatier", "Aureliver", "Brunowicht", "Burgerpop",
                    "Cedricviaccoz", "Christophe", "Claudioloureiro", "Ghislain", "Gregoire3245", "Hirtg",
                    "Houssm", "Icebaker", "JenniCin", "JiggyQ", "JulienB", "Kl", "Kperrard", "Leandro Kieliger",
                    "Marcus", "Martin", "MatteoGiorla", "Mireille", "Mj2905", "Musluoglucem", "Nacho",
                    "Nameless", "Nawel", "O'showa", "PA", "Qantik", "QuentinB", "Raphael.barman", "Roblan11",
                    "Romain Fournier", "Sbaaa", "Snus", "Sonia", "Tboyer", "Thierry", "Titi", "Vlaedr", "Wanda"]

depuis_date = '2017-05-02T16:00:00Z'

# Pages created since depuis_date by bots or people in the class,
# excluding files such as Fichier:Annonce biopath.png
liste_pages = []
for login in protected_logins:
    result = requests.post(baseurl + 'api.php?action=query&list=usercontribs&ucuser=' + login + '&format=xml&ucend=' + depuis_date)
    soup = BeautifulSoup(result.content, 'lxml')
    for primitive in soup.usercontribs.findAll('item'):
        if primitive['title'][:7] != 'Fichier':
            liste_pages.append(primitive['title'])

# Pages not to be considered
bad_pages_list = ['Accueil', 'Bots', 'HypermotBot', 'OrthoBot', 'SourceBot', 'VandalBot', 'PageUpdaterBot',
                  'BioPathBot', 'SPARQLBot', 'InferenceBot/CheckerBot', 'LinkBot', 'CheckerBot', 'InferenceBot',
                  'MiningBot', 'Chronobot', 'ChronoBot', 'ImageBot', 'FormatBot', 'TangoBot']
liste_pages_correct = list(set(liste_pages) - set(bad_pages_list))

allnames = list(liste_pages_correct)

# Also collect every page linked as a list entry on the Biographies page
pattern = re.compile(r'\* ?\[\[([^\]]*)\]\].*')
for line in fullbios.split('\n'):
    if line:
        match = pattern.match(line)
        if match:
            allnames.append(match.group(1))

# The argument is the plain text of a page.
# The function returns a tuple:
# - a boolean: True if all entries are sourced, False if sources are missing;
# - a list containing, for each unsourced entry, the date of that entry or,
#   when no date can be isolated, a message with the index of the bad line.
def findBadPage(pageText):
    # Keep only the entry lines (wiki list items starting with '*')
    tokens = []
    for t in pageText.split('\n'):
        if t and t[0] == '*':
            tokens.append(t)

    # An entry counts as sourced if it cites a letemps.ch URL
    sourced = re.compile(r'\*.*\[https?:\/\/w?w?w?\.?letemps[^\]]*\].*')
    # Patterns used to isolate the date of an unsourced entry
    dates_isolator_expr = [r'\* \[\[(.*)\]\] \/', r'\*\[\[(.*)\]\]\:', r'\*\[\[(.*)\]\]\/']

    index = 0
    allSourced = True
    wrongDatesSources = []
    for t in tokens:
        if not sourced.match(t):
            allSourced = False
            didmatch = False
            for expr in dates_isolator_expr:
                match = re.compile(expr).match(t)
                if match:
                    didmatch = True
                    wrongDatesSources.append(match.group(1))
                    break
            if not didmatch:
                wrongDatesSources.append('false source at line: ' + str(index))
        index += 1
    return (allSourced, wrongDatesSources)

def main(*args):
    # With no arguments, check every collected page;
    # otherwise check only the pages passed as arguments.
    if len(args) == 0:
        names = allnames
    else:
        names = args

    # Download the wikitext of each page
    fullCode = []
    for name in names:
        result = requests.post(baseurl + 'api.php?action=query&titles=' + name + '&export&exportnowrap')
        soup = BeautifulSoup(result.text, "lxml")
        code = ''
        for primitive in soup.findAll("text"):
            if primitive.string:
                code += primitive.string
        if code:
            fullCode.append((code, name))

    # Build the report
    content = '\n'
    content += 'Cette page liste toutes les biographies ayant des entrées non sourcées.'
    content += '\n==Bad page==\n'
    badpage = []
    for (code, name) in fullCode:
        (ok, wrongDatesSources) = findBadPage(code)
        if not ok:
            badpage.append(name)
            content += name
            content += '\n The wrong entries in this page are the sources with the following dates: '
            content += str(wrongDatesSources)
            content += '\n\n'

    # Publish the report on the FactChecking page
    payload = {'action': 'edit', 'assert': 'user', 'format': 'json', 'utf8': '', 'text': content,
               'summary': summary, 'title': 'FactChecking', 'token': edit_token}
    r4 = requests.post(baseurl + 'api.php', data=payload, cookies=edit_cookie)
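A minimal usage sketch, assuming the login above succeeded. The sample entries and the page title 'Albert Einstein' are illustrative, not part of the bot:

# Check two entries locally: the first cites letemps.ch and counts as
# sourced, the second does not, so its date is reported.
sample = ("*[[1905]]: Publication à [[Berne]]. [https://www.letemps.ch/archives]\n"
          "*[[1921]]: Prix Nobel de physique.")
allSourced, wrong = findBadPage(sample)
print(allSourced)  # False
print(wrong)       # ['1921']

# Regenerate the FactChecking report for a single page
# (hypothetical page title)
main('Albert Einstein')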