Code SourceBot
Latest revision as of 7 May 2019, 11:21
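# SourceBot: logs into Wikipast, collects the biography pages created by the
# class accounts and bots, flags every entry that does not cite a letemps
# source, and publishes the result on the 'FactChecking' page.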
import urllib
import requests
from bs4 import BeautifulSoup
import re
user='MasterBot'
passw='dhbot2019'
baseurl='http://wikipast.epfl.ch/wikipast/'
summary='Wikipastbot update'
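# MediaWiki authentication takes three requests: fetch a login token,
# log in with it, then fetch the CSRF token that authorizes edits.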
# Login request
payload={'action':'query','format':'json','utf8':'','meta':'tokens','type':'login'}
r1=requests.post(baseurl + 'api.php', data=payload)
# confirm the login with the login token
login_token=r1.json()['query']['tokens']['logintoken']
payload={'action':'login','format':'json','utf8':'','lgname':user,'lgpassword':passw,'lgtoken':login_token}
r2=requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)
# get the CSRF edit token
params3='?format=json&action=query&meta=tokens&continue='
r3=requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
edit_token=r3.json()['query']['tokens']['csrftoken']
edit_cookie=r2.cookies.copy()
edit_cookie.update(r3.cookies)
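# edit_token and edit_cookie are reused by main() to authenticate the final edit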
fullbios = ''
name = 'Biographies'
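# the export API returns the raw wikitext of a page inside a <text> element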
result=requests.post(baseurl+'api.php?action=query&titles='+name+'&export&exportnowrap')
soup=BeautifulSoup(result.text, "lxml")
code=''
for primitive in soup.findAll("text"):
    code += primitive.string
fullbios = code
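# fullbios now holds the wikitext of the 'Biographies' index page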
protected_logins=["Frederickaplan","Maud","Vbuntinx","Testbot","SparqlBot","IB","SourceBot","PageUpdaterBot","Orthobot","BioPathBot","ChronoBOT","Amonbaro","AntoineL","AntoniasBanderos","Arnau","Arnaudpannatier","Aureliver","Brunowicht","Burgerpop","Cedricviaccoz","Christophe","Claudioloureiro","Ghislain","Gregoire3245","Hirtg","Houssm","Icebaker","JenniCin","JiggyQ","JulienB","Kl","Kperrard","Leandro Kieliger","Marcus","Martin","MatteoGiorla","Mireille","Mj2905","Musluoglucem","Nacho","Nameless","Nawel","O'showa","PA","Qantik","QuentinB","Raphael.barman","Roblan11","Romain Fournier","Sbaaa","Snus","Sonia","Tboyer","Thierry","Titi","Vlaedr","Wanda"]
depuis_date='2017-05-02T16:00:00Z'
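# only contributions made since this timestamp are collected (ucend)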
#pages created by Bots or people in the class but not the ones like Fichier:Annonce biopath.png
liste_pages=[]
for user in protected_logins:
    result=requests.post(baseurl+'api.php?action=query&list=usercontribs&ucuser='+user+'&format=xml&ucend='+depuis_date)
    soup=BeautifulSoup(result.content,'lxml')
    for primitive in soup.usercontribs.findAll('item'):
        if primitive['title'][:7] != 'Fichier':
            liste_pages.append(primitive['title'])
#pages not to be considered.
bad_pages_list = ['Accueil', 'Bots','HypermotBot', 'OrthoBot', 'SourceBot','VandalBot','PageUpdaterBot', 'BioPathBot', 'SPARQLBot','InferenceBot/CheckerBot','LinkBot','CheckerBot', 'InferenceBot', 'MiningBot','Chronobot', 'ChronoBot', 'ImageBot','FormatBot', 'TangoBot']
liste_pages_correct = list(set(liste_pages) - set(bad_pages_list))
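# final candidate list: pages touched by the class, minus the service pages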
allnames = list(liste_pages_correct)
# capture the page name inside [[...]] on each bullet line of the index
pattern = r'\* ?\[\[([^\]]*)\]\].*'
p = re.compile(pattern)
bioNames = fullbios.split('\n')
for t in bioNames:
    if t:
        match = p.match(t)
        if match:
            allnames.append(match.group(1))
# The argument is the plain text of a page.
# This function returns a tuple:
#   - a boolean: True if every entry is sourced, False if sources are missing
#   - a list of the dates (or, failing that, the line indexes) of the entries
#     whose source is wrong or missing
def findBadPage(pageText):
    # keep only the entry lines (bullets starting with '*')
    tokens = []
    for t in pageText.split('\n'):
        if t and t[0] == '*':
            tokens.append(t)
    # an entry counts as sourced if it cites a letemps URL
    r = r'\*.*\[https?:\/\/w?w?w?\.?letemps[^\]]*\].*'
    p = re.compile(r)
    # variants used on Wikipast to isolate the date at the start of an entry
    dates_isolator_expr = [r'\* \[\[(.*)\]\] \/', r'\*\[\[(.*)\]\]\:', r'\*\[\[(.*)\]\]\/', r'\*\[\[(.*)\]\] \/']
    index = 0
    allSourced = True
    wrongDatesSources = []
    for t in tokens:
        match = p.match(t)
        if not match:
            allSourced = False
            didmatch = False
            for i in dates_isolator_expr:
                d = re.compile(i)
                match = d.match(t)
                if match:
                    didmatch = True
                    wrongDatesSources.append(match.group(1))
            # no date could be isolated: report the entry index instead
            if not didmatch:
                wrongDatesSources.append('false source at line: ' + str(index))
        index += 1
    return (allSourced, wrongDatesSources)
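# Illustrative example (hypothetical input), assuming entries follow the
# '* [[date]] / ...' format matched above:
#   findBadPage('* [[1905]] / Event without a source.')
#   -> (False, ['1905'])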
def main(*args):
    # with no arguments, check every collected page name
    if len(args) == 0:
        names = allnames
    else:
        names = args
    fullCode = []
    for name in names:
        result=requests.post(baseurl+'api.php?action=query&titles='+name+'&export&exportnowrap')
        soup=BeautifulSoup(result.text, "lxml")
        code=''
        for primitive in soup.findAll("text"):
            if primitive.string:
                code += primitive.string
        if code:
            fullCode.append((code, name))
    content = '\n'
    # ("This page lists every biography that has unsourced entries.")
    content += 'Cette page liste toutes les biographies ayant des entrées non sourcées.'
    content += '\n==Bad page==\n'
    badpage = []
    for (c, name) in fullCode:
        (ok, wrongDatesSources) = findBadPage(c)
        if not ok:
            badpage.append(name)
            content += name
            content += '\n The wrong entries in this page are the sources with the following dates: '
            content += str(wrongDatesSources)
            content += '\n\n'
    # publish the report on the 'FactChecking' page
    payload = {'action':'edit','assert':'user','format':'json','utf8':'','text':content,
               'summary':summary,'title':'FactChecking','token':edit_token}
    r4=requests.post(baseurl+'api.php',data=payload,cookies=edit_cookie)
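# Not in the original listing: a guard so the check actually runs when the
# script is executed directly (assumption; the listing above never calls main()).
if __name__ == '__main__':
    main()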