SourceBot
Version of 9 May 2017 at 12:00
Description
The bot detects when an entry in a biography is missing a source and flags it. It updates a FactChecking page that lists the unsourced entries, associating each one with an identifying reference (the date of the entry). It scans the pages regularly, and once the missing source has been added it puts the entry back into the biography.
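As a minimal sketch of the detection step (the two entry lines below are invented for illustration; the exact Wikipast entry format may differ), the bot accepts a line as sourced when it contains a bracketed link to letemps, using the same regular expression as the code below:

import re

# Pattern from the bot's code: an entry line starting with '*' that contains
# a bracketed http(s) link to letemps.
sourced = re.compile(r'\*.*\[https?://w?w?w?\.?letemps[^\]]*\].*')

good = "* [[1998.02.13]] / [[Lausanne]]. Élection de [[Daniel Brélaz]]. [http://www.letemps.ch/archives]"
bad  = "* [[1998.02.13]] / [[Lausanne]]. Élection de [[Daniel Brélaz]]."

print(bool(sourced.match(good)))   # True: the entry is considered sourced
print(bool(sourced.match(bad)))    # False: the entry would be listed on FactChecking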
Examples
Performance
Code
# coding: utf-8

import urllib
import requests
from bs4 import BeautifulSoup
import re

# Bot credentials and wiki configuration
user='Vlaedr'
passw='Alextall007'
baseurl='http://wikipast.epfl.ch/wikipast/'
summary='Wikipastbot update'
#names=['Daniel Brélaz']

# Login request: ask the API for a login token
payload={'action':'query','format':'json','utf8':'','meta':'tokens','type':'login'}
r1=requests.post(baseurl + 'api.php', data=payload)

# Confirm the login with the bot credentials
login_token=r1.json()['query']['tokens']['logintoken']
payload={'action':'login','format':'json','utf8':'','lgname':user,'lgpassword':passw,'lgtoken':login_token}
r2=requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)

# Get the edit (csrf) token and the cookies needed for editing
params3='?format=json&action=query&meta=tokens&continue='
r3=requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
edit_token=r3.json()['query']['tokens']['csrftoken']

edit_cookie=r2.cookies.copy()
edit_cookie.update(r3.cookies)
# Download the wikitext of the 'Biographies' page, which lists every biography
fullbios = ''
name = 'Biographies'
result=requests.post(baseurl+'api.php?action=query&titles='+name+'&export&exportnowrap')
soup=BeautifulSoup(result.text, "lxml")
code=''
for primitive in soup.findAll("text"):
    code+=primitive.string
fullbios = code

# Extract the page names: each biography is listed on a line of the form '* [[Name]] ...'
allnames = []
pattern = r'\* ?\[\[([^\]]*)\]\].*'
p = re.compile(pattern)
bioNames = fullbios.split('\n')
for c in bioNames:
    if c:
        match = p.match(c)
        if match:
            allnames.append(match.group(1))
# Download the wikitext of every biography page
fullCode = []
for name in allnames:
    result=requests.post(baseurl+'api.php?action=query&titles='+name+'&export&exportnowrap')
    soup=BeautifulSoup(result.text, "lxml")
    code=''
    for primitive in soup.findAll("text"):
        code += primitive.string
    fullCode.append((code, name))
#print(fullCode)
# The argument is the plain wikitext of a page.
# This function returns a tuple:
#   - a boolean: True if all entries are sourced, False if at least one source is missing
#   - a list of the dates of all entries whose source is missing
#     (or a 'false source at line: i' marker when the date cannot be isolated)
def findBadPage(pageText):
    # keep only the entry lines (they start with '*')
    tokens = []
    for t in pageText.split('\n'):
        if t and t[0] == '*':
            tokens.append(t)
    # an entry counts as sourced if it contains a bracketed link to letemps
    p = re.compile(r'\*.*\[https?://w?w?w?\.?letemps[^\]]*\].*')
    # patterns used to isolate the date of an unsourced entry
    dates_isolator_expr = [r'\* \[\[(.*)\]\] /', r'\*\[\[(.*)\]\]:', r'\*\[\[(.*)\]\]/']
    index = 0
    allSourced = True
    wrongDatesSources = []
    for t in tokens:
        if not p.match(t):
            print(t)
            allSourced = False
            didmatch = False
            for expr in dates_isolator_expr:
                match = re.compile(expr).match(t)
                if match:
                    didmatch = True
                    wrongDatesSources.append(match.group(1))
            if not didmatch:
                wrongDatesSources.append('false source at line: ' + str(index))
            # (unused) pattern that would match any other http source on the line
            otherSource = r'\*.*\[(https?://.*)\].*'
            pOth = re.compile(otherSource)
            match = pOth.match(t)
        index += 1
    return (allSourced, wrongDatesSources)
# Build the FactChecking page: list every biography that has unsourced entries
content = '\n'
content += 'Cette page liste toutes les biographies ayant des entrées non sourcées.'
content += '\n==Bad page==\n'
badpage = []
for (c, name) in fullCode:
    (ok, wrongDatesSources) = findBadPage(c)
    if not ok:
        print(name)
        badpage.append(name)
        content += name
        content += '\n The wrong entries in this page are the sources with the following dates: '
        content += str(wrongDatesSources)
        content += '\n\n'

# Write the result to the FactChecking page
payload = {'action':'edit','assert':'user','format':'json','utf8':'','text':content,
           'summary':summary,'title':'FactChecking','token':edit_token}
r4=requests.post(baseurl+'api.php',data=payload,cookies=edit_cookie)