LinkBot
Latest revision as of 30 May 2017, at 11:36
Description
- LinkBot goes through the Wikipast articles and, for each of them, looks for a corresponding article on the French-language Wikipedia. For each match found, it appends a Wikipedia section containing the link to the Wikipedia page at the end of the Wikipast article, but only if such a section is not already present on the page. The Wikipast articles processed are those created by the approved users.
- The search for a matching Wikipedia article is done through the Wikipedia API's search algorithm, which generally returns a single article. If several articles are returned, the Wikipedia section is not added (see the sketch after this list).
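To make the "several articles" case concrete, here is a minimal sketch of the lookup with the wikipedia Python package (the same package used in the Code section below). It assumes that an ambiguous title raises wikipedia.exceptions.DisambiguationError and that a missing title raises wikipedia.exceptions.PageError; the bot's own getWikipediaLink uses a broader try/except that covers both cases at once. The function name find_single_match is hypothetical and only used for illustration.

import wikipedia

wikipedia.set_lang("fr")

def find_single_match(title):
    # Return the URL of the unique matching article, or '' when there is
    # no match or when several articles are possible.
    try:
        return wikipedia.page(title).url
    except wikipedia.exceptions.DisambiguationError:
        # Several candidate articles: the Wikipedia section is not added.
        return ''
    except wikipedia.exceptions.PageError:
        # No matching article.
        return ''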
Examples
- Wikipedia section added on the Munich page
- Wikipedia section added on the Wolfgang Pauli page
Performance
The "Wikipedia" section could be added to every biographical datafication article listed on the Biographies page.
Scheduling
The bot should be run for every new article created by an approved user, and whenever the title of such an article changes. If that is not possible, it can be run once a week, which seems to be a suitable frequency for this bot.
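As a minimal sketch of the weekly run described above (all names here are hypothetical, not part of the bot): assuming the Code section below is saved as a module named linkbot exposing main(), a small wrapper can call it once a week. A weekly cron entry pointing at the script would work just as well.

import time

from linkbot import main  # hypothetical module name for the Code section below

WEEK_IN_SECONDS = 7 * 24 * 3600

def run_weekly():
    # Run the bot once, then sleep for a week, indefinitely.
    while True:
        main()
        time.sleep(WEEK_IN_SECONDS)

if __name__ == "__main__":
    run_weekly()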
Code
# -*- coding: utf-8 -*-
import lxml  # parser backend used by BeautifulSoup below
import requests
from bs4 import BeautifulSoup
import wikipedia
import re

# Login, settings and format data
user = 'TotallyNotBot'
passw = 'XXX'  # placeholder: the actual bot password is not published
baseurl = 'http://wikipast.epfl.ch/wikipast/'
summary = 'LinkBot update'
# Section header added by the bot; also used for the regex search in removeWikipediaContent
wikipediaHeader = '==Wikipedia=='
wikipedia.set_lang("fr")

# Constants
pageNotFound = -1
wikipediaLinkNotFound = -2
# - - - - - - -
# Returns all the pages modified or added by the users of the course.
def getPageList():
    protected_logins=["Frederickaplan","Maud","Vbuntinx","Testbot","SparqlBot","IB","SourceBot","PageUpdaterBot","Orthobot","BioPathBot","ChronoBOT","Amonbaro","AntoineL","AntoniasBanderos","Arnau","Arnaudpannatier","Aureliver","Brunowicht","Burgerpop","Cedricviaccoz","Christophe","Claudioloureiro","Ghislain","Gregoire3245","Hirtg","Houssm","Icebaker","JenniCin","JiggyQ","JulienB","Kl","Kperrard","Leandro Kieliger","Marcus","Martin","MatteoGiorla","Mireille","Mj2905","Musluoglucem","Nacho","Nameless","Nawel","O'showa","PA","Qantik","QuentinB","Raphael.barman","Roblan11","Romain Fournier","Sbaaa","Snus","Sonia","Tboyer","Thierry","Titi","Vlaedr","Wanda"]
    liste_pages = []
    for login in protected_logins:
        result = requests.post(baseurl + 'api.php?action=query&list=usercontribs&ucuser=' + login + '&format=xml')
        soup = BeautifulSoup(result.content, 'lxml')
        for primitive in soup.usercontribs.findAll('item'):
            liste_pages.append(primitive['title'])
    # Remove duplicates: a page may have been edited by several users
    liste_pages = list(set(liste_pages))
    return liste_pages
# Get the Wikipedia link for a page title.
# The return value is a string (e.g. page='Cern' returns 'https://en.wikipedia.org/wiki/CERN').
# If no link can be found, the empty string '' is returned.
def getWikipediaLink(page):
    try:
        page_wiki = wikipedia.page(page)
        return page_wiki.url
    except Exception:
        # Covers both missing pages and ambiguous titles (several candidate articles)
        return ''
# getContent returns the current wikitext of a page. It returns the empty
# string if the page is empty or does not exist, and pageNotFound if the
# request itself fails.
def getContent(page):
    try:
        result = requests.post(baseurl + 'api.php?action=query&titles=' + page + '&export&exportnowrap')
        soup = BeautifulSoup(result.text, 'lxml')
        data = ''
        for primitive in soup.findAll("text"):
            # primitive.string is None when the article is empty
            try:
                data += primitive.string
            except TypeError:
                return data
        return data
    except Exception:
        return pageNotFound
# Writes newContent to the page (the edit replaces the full page text).
def addContent(page, newContent):
    # Step 1: request a login token
    payload = {'action': 'query', 'format': 'json', 'utf8': '', 'meta': 'tokens', 'type': 'login'}
    r1 = requests.post(baseurl + 'api.php', data=payload)
    # Step 2: log in with the bot credentials
    login_token = r1.json()['query']['tokens']['logintoken']
    payload = {'action': 'login', 'format': 'json', 'utf8': '', 'lgname': user, 'lgpassword': passw, 'lgtoken': login_token}
    r2 = requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)
    # Step 3: get an edit (CSRF) token
    params3 = '?format=json&action=query&meta=tokens&continue='
    r3 = requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
    edit_token = r3.json()['query']['tokens']['csrftoken']
    edit_cookie = r2.cookies.copy()
    edit_cookie.update(r3.cookies)
    # Step 4: submit the edit
    payload = {'action': 'edit', 'assert': 'user', 'format': 'json', 'utf8': '', 'text': newContent, 'summary': summary, 'title': page, 'token': edit_token}
    r4 = requests.post(baseurl + 'api.php', data=payload, cookies=edit_cookie)
# addWikipediaContent re-saves the old content with the Wikipedia section
# appended at the end of the page.
def addWikipediaContent(page, oldContent, wikipediaLink):
    wikipediaContent = wikipediaHeader + '\n' + wikipediaLink
    newContent = oldContent + '\n' + wikipediaContent
    addContent(page, newContent)
# Removes the Wikipedia section (the header and the link line that follows it)
# from a page, using a regex search for the header.
def removeWikipediaContent(page):
    oldContent = getContent(page)
    if oldContent == pageNotFound:
        return
    if re.search(wikipediaHeader, oldContent) is not None:
        stringArray = oldContent.split('\n')
        newContent = ''
        headerFound = False
        for line in stringArray:
            if not headerFound:
                if line == wikipediaHeader:
                    # Skip the header line and the link line that follows it
                    headerFound = True
                else:
                    newContent += (line + '\n')
            else:
                headerFound = False
        addContent(page, newContent)
# Main program: for every page returned by getPageList() (i.e. the biographies),
# if the page has no Wikipedia section yet, add the corresponding Wikipedia link.
def main():
    print('LinkBot starting...')
    # Load the pages to which Wikipedia links should be added
    page_list = getPageList()
    # Lists for statistics (which links could not be found, which were already present?)
    page_list_without_wikipedia_link = []
    page_list_already_with_link = []
    page_list_new_wiki = []
    for page in page_list:
        # getContent also verifies that the page can be accessed. If it cannot,
        # oldContent equals pageNotFound and nothing is added.
        oldContent = getContent(page)
        if oldContent == pageNotFound:
            page_list_without_wikipedia_link.append(page)
            continue
        wikipediaLink = getWikipediaLink(page)
        # If the page contains a Wikipedia section that can no longer be verified, delete it
        if wikipediaHeader in oldContent and wikipediaLink == '':
            removeWikipediaContent(page)
            page_list_without_wikipedia_link.append(page)
            continue
        # Check whether a Wikipedia link has already been added
        if wikipediaHeader not in oldContent:
            # If wikipediaLink is the empty string, no appropriate link has been found
            if wikipediaLink != '':
                addWikipediaContent(page, oldContent, wikipediaLink)
                page_list_new_wiki.append(page)
            else:
                page_list_without_wikipedia_link.append(page)
        else:
            page_list_already_with_link.append(page)
    # Output statistics
    linksAdded = len(page_list) - len(page_list_without_wikipedia_link) - len(page_list_already_with_link)
    print(str(linksAdded) + ' links added to ' + str(len(page_list)) + ' pages.\n')

if __name__ == "__main__":
    main()