Code Triplificator

# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup

# Database: Wikipast hyperwords, their vocabulary equivalents, namespaces and argument types
Wikipast = ["Naissance", "Décès", "Retraite", "Élection", "Obsèques", "Participation", "Mort", "Démission",
                    "Nomination", "Diplôme", "Organisation", "Naturalisation", "Invention", "Rôle", "Meurtre",
                    "Inauguration", "Rencontre", "Mariage"]

VocabORG = ["birth", "death", "retirement", "investiture", "funeral", "participant", "death", "resignation",
                    "investiture", "graduation", "organization", "naturalization", "creator", "role", "murder",
                    "inauguration", "has met", "marriage"]

LibORG = ["bio", "bio", "bio", "bio", "bio", "bio", "bio","bio","bio","bio","bio","bio", "frbr/core",
                  "participation","bio","bio","relationship", "bio"]

Types = [["bio:principal"], ["bio:principal"], ["bio:principal"], ["bio:principal", "bio:position"], ["bio:principal"],
                 ["bio:principal"], ["bio:principal"], [], ["bio:principal", "bio:position", "bio:organization"],
                 ["bio:principal", "bio:organization"], ["bio:principal", "bio:organization"], [],
                 ["bio:event", "bio:principal"], ["bio:principal", "bio:nationality"], ["bio:work", "bio:principal"],
                 ["bio:principal", "frbr/core:work"], ["bio:principal", "bio:participant"], ["bio:organization"],
                 ["bio:principal"], [], ["bio:principal"], ["bio:partner"], ["bio:principal", "bio:partner"]]

# Build a dictionary mapping each hyperword to its (vocabulary term, namespace, argument types)
BDD = {}
for indice, terme in enumerate(Wikipast):
    BDD[terme] = (VocabORG[indice], LibORG[indice], Types[indice])
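# For example, BDD["Naissance"] is ("birth", "bio", ["bio:principal"]):
# the vocabulary term, its namespace, and the expected argument types.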

# Extract the names listed on the Biographies page

noms = []
baseurl = 'http://wikipast.epfl.ch/wikipast/'

resultat = requests.post(baseurl+'api.php?action=query&titles='+'Biographies'+'&export&exportnowrap')
soup = BeautifulSoup(resultat.text, "lxml")
code_noms=''

for primitive in soup.findAll("text"):
    code_noms += primitive.string

for phrase in code_noms.split("| [["):
    nom = phrase.split("]]")
    nom = nom[0].strip()
    nom = nom.replace(' ','_')
    noms.append(nom)

noms = noms[1:]  # drop the text that precedes the first name entry

# Processing of each biographical datafication by the bot

def work(pages):
    for nom in pages:
        sortie = ""
        code = ''
        result = requests.post(baseurl + 'api.php?action=query&titles=' + nom + '&export&exportnowrap')
        soup = BeautifulSoup(result.text, "lxml")

        for primitive in soup.findAll("text"):
            code += primitive.string

        entrees = code.split("*")
        entrees_avec_resultat = 0

        for entree in entrees:

            lieu = []
            date = []

            # Find the date
            date_lieu_action = entree.split("/")
            date = date_lieu_action[0]
            date = date.replace("[[", "")
            date = date.replace("]]", "")
            date = ''.join(date.split())  # strip all whitespace (tabs, newlines, spaces, ...)
            if not date or not date[0].isdigit():  # if the first character is not a digit, this is probably not a date
                continue

            # Find the place
            if len(date_lieu_action) > 1:  # there is something after the date

                lieu_action = date_lieu_action[1]

                lieu_action = lieu_action.replace("[https:", "")
                lieu_action = lieu_action.replace("[http:", "")

                points = lieu_action.count('.')

                # The place, when present, sits before the first '.' separating it from the action
                if points > (1 if lieu_action.endswith('.') else 0):
                    lieu_action = lieu_action.split('.')
                    lieu = lieu_action[0]
                    lieu = lieu.replace("[[", "")
                    lieu = lieu.replace("]]", "")
                    lieu = ''.join(lieu.split())  # strip all whitespace (tabs, newlines, spaces, ...)
                else:
                    lieu = '-'
                    lieu_action = [lieu] + [lieu_action]
            else:
                continue  # no place or action at all in this entry

            if len(lieu_action) < 2:
                continue

            split_phase_1 = lieu_action[1].split("[[")

            actions = []

            for mot in split_phase_1:
                action = mot.split("]]")
                actions.append(action[0].strip())

            if "" in actions:
                actions.remove("")

            if len(actions) != 0:
                for hypermot in BDD:
                    if hypermot == actions[0]:
                        entrees_avec_resultat += 1
                        sortie = sortie + "_:e a " + BDD[hypermot][1] + ":" + BDD[hypermot][0] + "\n   ; dc:date '" + date + "'\n   ; bio:Place '" + lieu + "'\n"

                        for indice, propriete in enumerate(BDD[hypermot][2], start=1):
                            if indice >= len(actions):  # the entry provides no argument for this property
                                continue
                            sortie = sortie + "   ; " + propriete + " '" + actions[indice] + "' \n"
                        sortie += "\n"
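                        # Illustrative example (hypothetical entry): for
                        # "[[1879]] / [[Ulm]]. [[Naissance]] de [[Albert Einstein]]."
                        # the block appended to `sortie` reads:
                        #   _:e a bio:birth
                        #      ; dc:date '1879'
                        #      ; bio:Place 'Ulm'
                        #      ; bio:principal 'Albert Einstein'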

        # Create the corresponding RDF page
        if entrees_avec_resultat > 1:

            nouveau_nom = nom.replace('_',' ') + " RDF"

            user='MasterBot'
            passw='dhbot2019'

            summary='Triplificator update'

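            # MediaWiki API login flow: request a login token, log in with it, then fetch a CSRF token for editing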
            payload={'action':'query','format':'json','utf8':'','meta':'tokens','type':'login'}
            r1=requests.post(baseurl + 'api.php', data=payload)

            login_token=r1.json()['query']['tokens']['logintoken']
            payload={'action':'login','format':'json','utf8':'','lgname':user,'lgpassword':passw,'lgtoken':login_token}
            r2=requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)

            params3='?format=json&action=query&meta=tokens&continue='
            r3=requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
            edit_token=r3.json()['query']['tokens']['csrftoken']

            edit_cookie=r2.cookies.copy()
            edit_cookie.update(r3.cookies)

            content= ''
            content+='<nowiki>\n'
            content+= sortie
            content+='\n'
            content+='Page originale: '+nom.replace('_',' ')+'\n'
            payload={'action':'edit','assert':'user','format':'json','utf8':'','text':content,'summary':summary,'title':nouveau_nom,'token':edit_token}

            # Remove any previous link to the RDF page before appending the up-to-date one
            to_replace = 'Version RDF : '+nouveau_nom
            code = code.replace(to_replace,' ')
            payload2 = {'action': 'edit', 'assert': 'user', 'format': 'json', 'utf8': '', 'text': code + '\n' + 'Version RDF : '+nouveau_nom, 'summary': summary, 'title': nom, 'token': edit_token}

            r4 = requests.post(baseurl + 'api.php', data=payload, cookies=edit_cookie)
            r5 = requests.post(baseurl + 'api.php', data=payload2, cookies=edit_cookie)


def main(*args):
    if len(args) == 0:
        work(noms)
    else:
        work(args)
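

# Minimal entry point (an assumption: the original page does not show how the bot is launched).
# Running the script directly processes every biography page found above; specific page
# titles can also be passed programmatically, e.g. main('Some_Page').
if __name__ == "__main__":
    main()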