LinkBot


Description

  • LinkBot goes through the Wikipast articles and, for each of them, looks for a matching article on the French-language Wikipedia. For every match it finds, it appends a Wikipedia section containing the link to the Wikipedia page at the end of the Wikipast article, but only if that section is not already present on the page. The Wikipast articles it goes through are those created by the approved users.
  • The search for a matching Wikipedia article is performed by an algorithm of the Wikipedia API, which generally returns a single article. If several articles are returned, the Wikipedia section is not added (see the sketch below).
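
A minimal sketch of that rule, assuming the Python wikipedia package the bot already uses. The helper name findUniqueArticle is hypothetical, and the explicit exception handling below illustrates the behaviour described above rather than reproducing the bot's exact code (the bot catches all exceptions indiscriminately):

 import wikipedia

 wikipedia.set_lang("fr")

 def findUniqueArticle(title):
     # Return the URL of the single matching French Wikipedia article,
     # or '' when no article exists or several candidates match.
     try:
         return wikipedia.page(title).url
     except wikipedia.exceptions.DisambiguationError:
         # Several articles share this title: no Wikipedia section is added.
         return ''
     except wikipedia.exceptions.PageError:
         # No article with this title exists.
         return ''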

Examples

Performance

The "Wikipedia" section was successfully added to all of the biographical datafication articles listed on the Biographies page.

Scheduling

The bot should be run whenever an approved user creates a new article or renames an existing one. If that is not possible, it can be run once a week, which seems to be an appropriate frequency for this bot; a sample schedule is sketched below.
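
A minimal scheduling sketch, assuming the script is saved as linkbot.py and run by cron on a Unix host (the paths below are hypothetical); this entry runs the bot every Monday at 03:00:

 # m h dom mon dow  command
 0 3 * * 1  python3 /path/to/linkbot.py >> /var/log/linkbot.log 2>&1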

Code

 # -*- coding: utf-8 -*-
 import lxml
 import requests
 from bs4 import BeautifulSoup
 import wikipedia
 import re
 import sys

 # Login, settings and data format.
 user='TotallyNotBot'
 passw='123456'
 baseurl='http://wikipast.epfl.ch/wikipast/'
 summary='LinkBot update'
 # Used for the regex search in removeWikipediaContent.
 wikipediaHeader = '==Wikipedia=='

 # Look up articles on the French-language Wikipedia.
 wikipedia.set_lang("fr")

 # Constants used as error codes.
 pageNotFound = -1
 wikipediaLinkNotFound = -2

 # - - - - - - -
 # Returns all the pages modified and added by the users of the course.
 def getPageList():
     protected_logins=["Frederickaplan","Maud","Vbuntinx","Testbot","SparqlBot","IB","SourceBot","PageUpdaterBot","Orthobot","BioPathBot","ChronoBOT","Amonbaro","AntoineL","AntoniasBanderos","Arnau","Arnaudpannatier","Aureliver","Brunowicht","Burgerpop","Cedricviaccoz","Christophe","Claudioloureiro","Ghislain","Gregoire3245","Hirtg","Houssm","Icebaker","JenniCin","JiggyQ","JulienB","Kl","Kperrard","Leandro Kieliger","Marcus","Martin","MatteoGiorla","Mireille","Mj2905","Musluoglucem","Nacho","Nameless","Nawel","O'showa","PA","Qantik","QuentinB","Raphael.barman","Roblan11","Romain Fournier","Sbaaa","Snus","Sonia","Tboyer","Thierry","Titi","Vlaedr","Wanda"]

     liste_pages=[]
     for user in protected_logins:
         # Ask the MediaWiki API for all contributions of this user (XML format).
         result=requests.post(baseurl+'api.php?action=query&list=usercontribs&ucuser='+user+'&format=xml')
         soup=BeautifulSoup(result.content,'lxml')
         for primitive in soup.usercontribs.findAll('item'):
             liste_pages.append(primitive['title'])

     # Remove duplicate titles.
     liste_pages=list(set(liste_pages))

     return liste_pages


 # Get the Wikipedia link for a page title.
 # The return value is a string (e.g. page='Cern' returns 'https://en.wikipedia.org/wiki/CERN').
 # If no link can be found, the empty string '' is returned.
 def getWikipediaLink(page):
     try:
         page_wiki = wikipedia.page(page)
         return page_wiki.url

     except:
         return ''

 # getContent returns the current content of a wiki page. Returns
 # pageNotFound if the page could not be fetched; an empty string is
 # returned when the page has no readable content.
 def getContent(page):
     try:
         result=requests.post(baseurl+'api.php?action=query&titles='+page+'&export&exportnowrap')
         soup=BeautifulSoup(result.text,'lxml')
         data=''

         for primitive in soup.findAll("text"):
             # If the article is empty, primitive.string is None and the
             # concatenation fails, so whatever was collected is returned.
             try:
                 data+=primitive.string
             except:
                 return data

         return data

     except:
         return pageNotFound


 # Writes newContent to the page (callers pass the complete new page text).
 def addContent(page, newContent):
     # Login request: ask the API for a login token.
     payload={'action':'query','format':'json','utf8':'','meta':'tokens','type':'login'}
     r1=requests.post(baseurl + 'api.php', data=payload)

     # Confirm the login with the token.
     login_token=r1.json()['query']['tokens']['logintoken']
     payload={'action':'login','format':'json','utf8':'','lgname':user,'lgpassword':passw,'lgtoken':login_token}
     r2=requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)

     # Get an edit (CSRF) token.
     params3='?format=json&action=query&meta=tokens&continue='
     r3=requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
     edit_token=r3.json()['query']['tokens']['csrftoken']

     edit_cookie=r2.cookies.copy()
     edit_cookie.update(r3.cookies)

     # Submit the edit.
     payload={'action':'edit','assert':'user','format':'json','utf8':'','text':newContent,'summary':summary,'title':page,'token':edit_token}
     r4=requests.post(baseurl+'api.php',data=payload,cookies=edit_cookie)
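
The edit request at the end of addContent is fired and forgotten; its response r4 is never inspected. As a minimal sketch (an illustrative helper, not part of the original bot), the MediaWiki edit API reports success in its JSON body, which could be checked like this:

 def editSucceeded(response):
     # On success the MediaWiki edit API answers with {'edit': {'result': 'Success', ...}}.
     try:
         return response.json().get('edit', {}).get('result') == 'Success'
     except ValueError:
         # The response body was not JSON (e.g. an HTML error page).
         return False

addContent could then end with "return editSucceeded(r4)" so that callers are able to log failed edits.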


 # addWikipediaContent saves the old content with the Wikipedia section
 # (header plus link) appended at the end of the page.
 def addWikipediaContent(page, oldContent, wikipediaLink):
     wikipediaContent = wikipediaHeader + '\n' + wikipediaLink
     newContent = oldContent + '\n' + wikipediaContent
     addContent(page, newContent)


 # Removes the Wikipedia section (the header line and the link line that
 # follows it) from a page; a regex search detects whether the header is present.
 def removeWikipediaContent(page):
     oldContent = getContent(page)

     if (oldContent == pageNotFound):
         return

     if re.search(wikipediaHeader, oldContent) is not None:
         stringArray = oldContent.split('\n')

         newContent = ''
         headerFound = False
         for line in stringArray:
             if not headerFound:
                 if line == wikipediaHeader:
                     # Skip the header line and the single link line after it.
                     headerFound = True
                 else:
                     newContent += (line + '\n')
             else:
                 headerFound = False

         addContent(page, newContent)


 # Main program: for every page returned by getPageList() (i.e. the biographies),
 # if the page has no Wikipedia section yet, add the corresponding Wikipedia link.
 def main():
     print('LinkBot starting...')

     # Load the pages to which Wikipedia links should be added.
     page_list = getPageList()

     # Lists for reporting (which links could not be found, which pages already had one?).
     page_list_without_wikipedia_link = []
     page_list_already_with_link = []
     page_list_new_wiki = []

     for page in page_list:
         # getContent also verifies that the page can be accessed. If it cannot,
         # oldContent is equal to pageNotFound and nothing is added.
         oldContent = getContent(page)
         wikipediaLink = getWikipediaLink(page)

         # If the page has a Wikipedia section that can no longer be verified, delete it.
         if oldContent != pageNotFound and wikipediaHeader in oldContent and wikipediaLink == '':
             removeWikipediaContent(page)

         # Check whether a Wikipedia link has already been added.
         if oldContent != pageNotFound and wikipediaHeader not in oldContent:

             # If wikipediaLink is the empty string, no appropriate link has been found.
             if (wikipediaLink != ''):
                 addWikipediaContent(page, oldContent, wikipediaLink)
                 page_list_new_wiki.append(page)

             else:
                 page_list_without_wikipedia_link.append(page)

         else:
             page_list_already_with_link.append(page)

     # Output summary.
     linksAdded = len(page_list) - len(page_list_without_wikipedia_link) - len(page_list_already_with_link)
     print(str(linksAdded) + ' links added to ' + str(len(page_list)) + ' pages.\n')

 if __name__ == "__main__":
     main()

Wikipedia

https://fr.wikipedia.org/wiki/Lintot