LinkBot
Version of 30 May 2017, 08:05
Description
- LinkBot crawls the articles of Wikipast and, for each of them, searches for a corresponding article on the French Wikipedia. For each match found, it appends a Wikipedia section containing the link to the Wikipedia page at the end of the Wikipast article, but only if that section is not already present on the page. The Wikipast articles crawled are those created by approved users.
- The search for a matching article is performed by an algorithm of the Wikipedia API, which generally returns a single article. If several candidate articles are returned, the Wikipedia section is not added (see the sketch below).
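A minimal sketch of this lookup, assuming the wikipedia Python package used in the Code section below; the title 'Wolfgang Pauli' is only an illustrative input:

import wikipedia

wikipedia.set_lang("fr")
try:
    # An unambiguous title resolves to a single page object.
    page = wikipedia.page("Wolfgang Pauli")
    print(page.url)  # e.g. https://fr.wikipedia.org/wiki/Wolfgang_Pauli
except wikipedia.DisambiguationError:
    # Several candidate articles are returned: the bot adds no section.
    print("ambiguous title, section not added")
except wikipedia.PageError:
    # No matching article at all: the bot also adds no section.
    print("no match, section not added")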
Examples
- Wikipedia section added to the Munich page
- Wikipedia section added to the Wolfgang Pauli page
Performance
The "Wikipedia" section could be added to all of the biographical datafication articles listed on the Biographies page.
Scheduling
The bot should be run whenever an approved user creates a new article, or whenever an approved user renames an article. If that is not possible, it can be run weekly, which seems to be a frequency well suited to the bot, as sketched below.
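For instance, a weekly run could be scheduled with a crontab entry such as the following; the script name linkbot.py and the Monday 03:00 time slot are illustrative assumptions, not part of the original setup:

# Hypothetical weekly schedule: run LinkBot every Monday at 03:00
0 3 * * 1 python3 /path/to/linkbot.py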
Code
# -*- coding: utf-8 -*-
import lxml
import requests
from bs4 import BeautifulSoup
import wikipedia
import re
import sys

# login, settings and format data
user='TotallyNotBot'
passw='XXX'  # placeholder, the real password is redacted
baseurl='http://wikipast.epfl.ch/wikipast/'
summary='LinkBot update'
wikipediaHeader = '==Wikipedia=='  # Used for regex search in removeWikipediaContent
wikipedia.set_lang("fr")

# Constants
pageNotFound = -1
wikipediaLinkNotFound = -2

# - - - - - - -

# Returns all the pages modified and added by the users of the course.
def getPageList():
    protected_logins=["Frederickaplan","Maud","Vbuntinx","Testbot","SparqlBot","IB","SourceBot","PageUpdaterBot","Orthobot","BioPathBot","ChronoBOT","Amonbaro","AntoineL","AntoniasBanderos","Arnau","Arnaudpannatier","Aureliver","Brunowicht","Burgerpop","Cedricviaccoz","Christophe","Claudioloureiro","Ghislain","Gregoire3245","Hirtg","Houssm","Icebaker","JenniCin","JiggyQ","JulienB","Kl","Kperrard","Leandro Kieliger","Marcus","Martin","MatteoGiorla","Mireille","Mj2905","Musluoglucem","Nacho","Nameless","Nawel","O'showa","PA","Qantik","QuentinB","Raphael.barman","Roblan11","Romain Fournier","Sbaaa","Snus","Sonia","Tboyer","Thierry","Titi","Vlaedr","Wanda"]
    liste_pages=[]
    for login in protected_logins:
        result=requests.post(baseurl+'api.php?action=query&list=usercontribs&ucuser='+login+'&format=xml')
        soup=BeautifulSoup(result.content,'lxml')
        for primitive in soup.usercontribs.findAll('item'):
            liste_pages.append(primitive['title'])
    liste_pages=list(set(liste_pages))
    return liste_pages

# Gets the wikipedia link for a page.
# The return format is a string (e.g. page='Cern' returns 'https://en.wikipedia.org/wiki/CERN').
# If it cannot find the link, it returns the empty string ''.
def getWikipediaLink(page):
    try:
        page_wiki = wikipedia.page(page)
        return page_wiki.url
    except:
        return ''

# getContent returns the current content of a wiki page. Returns
# pageNotFound if the page could not be fetched; returns the empty
# string if the content is empty.
def getContent(page):
    try:
        result=requests.post(baseurl+'api.php?action=query&titles='+page+'&export&exportnowrap')
        soup=BeautifulSoup(result.text,'lxml')
        data=''
        for primitive in soup.findAll("text"):
            # Check if the article is empty; if it is, return the empty string
            try:
                data+=primitive.string
            except:
                return data
        return data
    except:
        return pageNotFound

# Adds new content (newContent) to the page.
def addContent(page, newContent):
    # Login request
    payload={'action':'query','format':'json','utf8':'','meta':'tokens','type':'login'}
    r1=requests.post(baseurl + 'api.php', data=payload)
    # Confirm login
    login_token=r1.json()['query']['tokens']['logintoken']
    payload={'action':'login','format':'json','utf8':'','lgname':user,'lgpassword':passw,'lgtoken':login_token}
    r2=requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)
    # Get edit token
    params3='?format=json&action=query&meta=tokens&continue='
    r3=requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
    edit_token=r3.json()['query']['tokens']['csrftoken']
    edit_cookie=r2.cookies.copy()
    edit_cookie.update(r3.cookies)
    payload={'action':'edit','assert':'user','format':'json','utf8':'','text':newContent,'summary':summary,'title':page,'token':edit_token}
    r4=requests.post(baseurl+'api.php',data=payload,cookies=edit_cookie)

# addWikipediaContent writes back the old content with the wikipediaContent appended at the end of the page.
def addWikipediaContent(page, oldContent, wikipediaLink):
    wikipediaContent = wikipediaHeader + '\n' + wikipediaLink
    newContent = oldContent + '\n' + wikipediaContent
    addContent(page, newContent)

# Removes the wikipedia content from a page by using regex patterns.
def removeWikipediaContent(page):
    oldContent = getContent(page)
    if (oldContent == pageNotFound):
        return
    if re.search(wikipediaHeader, oldContent) is not None:
        stringArray = oldContent.split('\n')
        newContent = ''
        headerFound = False
        for line in stringArray:
            if not headerFound:
                if line == wikipediaHeader:
                    headerFound = True
                else:
                    newContent += (line + '\n')
            else:
                # Skip the single link line that follows the header
                headerFound = False
        addContent(page, newContent)

# Main program: for every page in getPageList() (i.e. the biographies), if the page has no
# wikipedia content, it adds the corresponding wikipedia link.
def main():
    print('LinkBot starting...')
    # Load pages to add Wikipedia links to
    page_list = getPageList()
    # Lists for data (which links could not be found, which have been added already?)
    page_list_without_wikipedia_link = []
    page_list_already_with_link = []
    page_list_new_wiki = []
    for page in page_list:
        ## This also verifies that the page can be accessed. If it cannot, oldContent will be
        ## equal to pageNotFound and hence nothing is added
        oldContent = getContent(page)
        wikipediaLink = getWikipediaLink(page)
        # If there is a non-verified wikipedia link, delete it
        # (the pageNotFound check avoids testing 'in' on the integer sentinel)
        if oldContent != pageNotFound and wikipediaHeader in oldContent and wikipediaLink == '':
            removeWikipediaContent(page)
        # Check if a Wikipedia link has already been added.
        if oldContent != pageNotFound and wikipediaHeader not in oldContent:
            # If wikipediaLink is the empty string then no appropriate link has been found.
            if (wikipediaLink != ''):
                addWikipediaContent(page, oldContent, wikipediaLink)
                page_list_new_wiki.append(page)
            else:
                page_list_without_wikipedia_link.append(page)
        else:
            page_list_already_with_link.append(page)
    # Output data
    linksAdded = len(page_list) - len(page_list_without_wikipedia_link) - len(page_list_already_with_link)
    print(str(linksAdded) + ' links added to ' + str(len(page_list)) + ' pages.\n')

if __name__ == "__main__":
    main()
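For reference, the wikitext appended by addWikipediaContent (the header followed by the link on the next line) looks like this, using the Wolfgang Pauli example from above:

==Wikipedia==
https://fr.wikipedia.org/wiki/Wolfgang_Pauli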
Wikipedia
https://fr.wikipedia.org/wiki/Lintot