LinkBot

Version of 30 May 2017 at 08:05


Description

  • LinkBot crawls the Wikipast articles and, for each of them, searches for a corresponding article on the French-language Wikipedia. For every match found, it appends a Wikipedia section containing the link to the Wikipedia page at the end of the Wikipast article, but only if such a section is not already present on the page. The Wikipast articles crawled are those created by approved users.
  • The search for a matching Wikipedia article is performed by an algorithm of the Wikipedia API. It generally returns a single article. If several articles are returned, the Wikipedia section is not added, as sketched below.
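
A minimal sketch of this rule, using the same wikipedia Python package as the code below (the helper name resolve_unique_page is hypothetical):

import wikipedia

wikipedia.set_lang("fr")

def resolve_unique_page(title):
    # Return the URL of the single matching article, or '' when the lookup
    # is ambiguous or fails, mirroring the bot's behaviour.
    try:
        return wikipedia.page(title).url
    except wikipedia.exceptions.DisambiguationError:
        # Several candidate articles were returned: skip this page.
        return ''
    except Exception:
        return ''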

Examples

  • Wikipedia section added on the Wolfgang Pauli page

Performance

La section "Wikipedia" a pu être ajoutée sur tous les articles de datafication (correction(s): ) biographique de la page Biographies.

Planning

The bot should be run whenever an approved user creates a new article, or whenever the title of one of an approved user's articles is changed. If that is not possible, it can be run weekly, which seems to be a suitable frequency for this bot.
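
One way to approximate the per-article trigger is to poll the wiki's recent changes and run the bot only when new pages have appeared. A sketch, assuming the same requests/BeautifulSoup stack as the code below (rctype and rclimit are standard MediaWiki API options; the limit of 50 is arbitrary):

import requests
from bs4 import BeautifulSoup

baseurl = 'http://wikipast.epfl.ch/wikipast/'

def new_page_titles(limit=50):
    # Ask the MediaWiki recentchanges API for recently created pages.
    result = requests.post(baseurl + 'api.php?action=query&list=recentchanges'
                           '&rctype=new&rclimit=' + str(limit) + '&format=xml')
    soup = BeautifulSoup(result.content, 'lxml')
    return [rc['title'] for rc in soup.findAll('rc')]

# A weekly driver could then compare new_page_titles() against the pages it
# has already processed and run LinkBot's main() only when something changed.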

Code

# -*- coding: utf-8 -*-
   
import lxml
import requests
from bs4 import BeautifulSoup
import wikipedia
import re
import sys

# login, settings and format data
user='TotallyNotBot'
passw='XXX'  # placeholder, not the real password
baseurl='http://wikipast.epfl.ch/wikipast/'
summary='LinkBot update'
wikipediaHeader = '==Wikipedia=='
# Used for regex search in removeWikipediaContent
wikipedia.set_lang("fr")

# Constants

pageNotFound = -1
wikipediaLinkNotFound = -2

# - - - - - - -  

# Returns all the pages modified and added by the users of the course.
def getPageList():
    protected_logins=["Frederickaplan","Maud","Vbuntinx","Testbot","SparqlBot","IB","SourceBot","PageUpdaterBot","Orthobot","BioPathBot","ChronoBOT","Amonbaro","AntoineL","AntoniasBanderos","Arnau","Arnaudpannatier","Aureliver","Brunowicht","Burgerpop","Cedricviaccoz","Christophe","Claudioloureiro","Ghislain","Gregoire3245","Hirtg","Houssm","Icebaker","JenniCin","JiggyQ","JulienB","Kl","Kperrard","Leandro Kieliger","Marcus","Martin","MatteoGiorla","Mireille","Mj2905","Musluoglucem","Nacho","Nameless","Nawel","O'showa","PA","Qantik","QuentinB","Raphael.barman","Roblan11","Romain Fournier","Sbaaa","Snus","Sonia","Tboyer","Thierry","Titi","Vlaedr","Wanda"]
    
    liste_pages=[]
    for user in protected_logins:
        result=requests.post(baseurl+'api.php?action=query&list=usercontribs&ucuser='+user+'&format=xml')
        soup=BeautifulSoup(result.content,'lxml')
        for primitive in soup.usercontribs.findAll('item'):
            liste_pages.append(primitive['title'])
    
    liste_pages=list(set(liste_pages))

    return liste_pages
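
# Note: list=usercontribs returns XML in which each <item> element carries a
# 'title' attribute; list(set(...)) deduplicates pages touched by several users.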


# Get the wikipedia link for a page title.
# The return format is a string (e.g. page='Cern' returns
# 'https://en.wikipedia.org/wiki/CERN' when the language is 'en'; with
# set_lang("fr") above, the French URL is returned instead).
# If it cannot find the link, it returns the empty string ''.
def getWikipediaLink(page):
    try:
        page_wiki = wikipedia.page(page)
        return page_wiki.url

    except:
        return ''
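
# Note: the bare except above also swallows wikipedia's DisambiguationError,
# which is how the rule "if several articles are returned, the Wikipedia
# section is not added" from the Description takes effect.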

# getContent returns the current wikitext of a wiki page, the empty string
# if the page has no content, or pageNotFound if the request fails.
def getContent(page):
    try:
        result=requests.post(baseurl+'api.php?action=query&titles='+page+'&export&exportnowrap') 
        soup=BeautifulSoup(result.text,'lxml')
        data=''

        for primitive in soup.findAll("text"):
            # primitive.string is None for an empty article; the addition
            # below then raises and we return what has been collected so far.
            try:
                data+=primitive.string
            except:
                return data
        
        return data

    except:
        return pageNotFound
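
# Note: the query above uses the MediaWiki export interface; '&export&exportnowrap'
# returns the page's wikitext inside <text> elements of the XML dump.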


# Adds a new content (newContent) to the page
def addContent(page, newContent):
    # Login request
    payload={'action':'query','format':'json','utf8':'','meta':'tokens','type':'login'}
    r1=requests.post(baseurl + 'api.php', data=payload)

    #login confirm
    login_token=r1.json()['query']['tokens']['logintoken']
    payload={'action':'login','format':'json','utf8':'','lgname':user,'lgpassword':passw,'lgtoken':login_token}
    r2=requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)

    #get edit token2
    params3='?format=json&action=query&meta=tokens&continue='
    r3=requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
    edit_token=r3.json()['query']['tokens']['csrftoken']

    edit_cookie=r2.cookies.copy()
    edit_cookie.update(r3.cookies)

    payload={'action':'edit','assert':'user','format':'json','utf8':'','text':newContent,'summary':summary,'title':page,'token':edit_token}
    r4=requests.post(baseurl+'api.php',data=payload,cookies=edit_cookie)
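
# Note: the four requests above follow the standard MediaWiki authentication
# flow: fetch a login token, log in with it, fetch a CSRF edit token, then
# submit the edit together with the accumulated session cookies.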


# addWikipediaContent appends wikipediaContent (the header plus the link) to
# the end of the page's old content.
def addWikipediaContent(page, oldContent, wikipediaLink):

    wikipediaContent = wikipediaHeader + '\n' + wikipediaLink
    newContent = oldContent + '\n' + wikipediaContent
    addContent(page, newContent)


# Removes the wikipedia content from a page by using regex patterns.
def removeWikipediaContent(page):
    
    oldContent = getContent(page)

    if (oldContent == pageNotFound):
        return

    if re.search(wikipediaHeader, oldContent) is not None:

        stringArray = oldContent.split('\n')

        newContent = ''
        headerFound = False
        for line in stringArray:
            if not headerFound:

                if line == wikipediaHeader:
                    headerFound = True
                else:
                    newContent += (line + '\n')
            else:
                headerFound = False

        addContent(page, newContent)
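
# Note: this removal assumes the Wikipedia section is exactly two lines (the
# '==Wikipedia==' header plus the link line added by addWikipediaContent):
# headerFound is reset after skipping the single line that follows the header.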


# Main program which, for every page in getPageList() (i.e. the biographies),
# adds the corresponding Wikipedia link if the page has no wikipedia content yet.
def main():
    

    print('LinkBot starting...')

    # load pages to add Wikipedia links
    page_list = getPageList()
    # lists for data (which links could not be found, which have been added already?)
    page_list_without_wikipedia_link = []
    page_list_already_with_link = []
    page_list_new_wiki = []

    for page in page_list:

        ## This also verifies if the page can be accessed. If it cannot, oldContent will be
        ## equal to pageNotFound and hence nothing is added
        oldContent = getContent(page)

        wikipediaLink = getWikipediaLink(page)

        # If there is a non-verified wikipedia link, delete it.
        # (oldContent may be pageNotFound, an int, so check that first to
        # avoid a TypeError in the substring test.)
        if oldContent != pageNotFound and wikipediaHeader in oldContent and wikipediaLink == '':
            removeWikipediaContent(page)

        # Check if a Wikipedia link has already been added.
        if oldContent != pageNotFound and wikipediaHeader not in oldContent:
            
            # If wikipediaLink is the null string then no appropriate link has been found.
            if (wikipediaLink != ''):
                addWikipediaContent(page, oldContent, wikipediaLink)
                page_list_new_wiki.append(page)

            else:
                page_list_without_wikipedia_link.append(page)

        else:
            page_list_already_with_link.append(page)


    # output data
    linksAdded = len(page_list) - len(page_list_without_wikipedia_link) - len(page_list_already_with_link)
    print(str(linksAdded) + ' links added to ' + str(len(page_list)) + ' pages.\n')

if __name__ == "__main__": main()

Wikipedia

https://fr.wikipedia.org/wiki/Lintot