Code Triplificator
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
# Creation of the database
Wikipast = ["Naissance", "Décès", "Retraite", "Élection", "Obsèques", "Participation", "Mort", "Démission",
"Nomination", "Diplôme", "Organisation", "Naturalisation", "Invention", "Rôle", "Meurtre",
"Inauguration", "Rencontre", "Mariage"]
VocabORG = ["birth", "death", "retirement", "investiture", "funeral", "participant", "death", "resignation",
"investiture", "graduation", "organization", "naturalization", "creator", "role", "murder",
"inauguration", "has met", "marriage"]
LibORG = ["bio", "bio", "bio", "bio", "bio", "bio", "bio","bio","bio","bio","bio","bio", "frbr/core",
"participation","bio","bio","relationship", "bio"]
Types = [["bio:principal"], ["bio:principal"], ["bio:principal"], ["bio:principal", "bio:position"], ["bio:principal"],
["bio:principal"], ["bio:principal"], [], ["bio:principal", "bio:position", "bio:organization"],
["bio:principal", "bio:organization"], ["bio:principal", "bio:organization"], [],
["bio:event", "bio:principal"], ["bio:principal", "bio:nationality"], ["bio:work", "bio:principal"],
["bio:principal", "frbr/core:work"], ["bio:principal", "bio:participant"], ["bio:organization"],
["bio:principal"], [], ["bio:principal"], ["bio:partner"], ["bio:principal", "bio:partner"]]
# Construction of a dictionary containing the data
BDD = {}
for terme in Wikipast:
    indice = Wikipast.index(terme)
    BDD[terme] = (VocabORG[indice], LibORG[indice], Types[indice])
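# Each key of BDD maps a Wikipast hyperword to (vocabulary term, vocabulary prefix, list of argument types),
# e.g. BDD["Naissance"] == ("birth", "bio", ["bio:principal"]).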
# Extraction of the names
noms = []
baseurl = 'http://wikipast.epfl.ch/wikipast/'
resultat = requests.post(baseurl+'api.php?action=query&titles='+'Biographies'+'&export&exportnowrap')
soup = BeautifulSoup(resultat.text, "lxml")
code_noms = ''
for primitive in soup.findAll("text"):
    code_noms += primitive.string
for phrase in code_noms.split("| [["):
    nom = phrase.split("]]")
    nom = nom[0].strip()
    nom = nom.replace(' ', '_')
    noms.append(nom)
noms = noms[1:]
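# The "Biographies" page is assumed to list the names in table rows of the form "| [[Some Name]]";
# the slice above drops the fragment that precedes the first name.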
# Processing of each biographical datafication by the bot
def work(pages):
    for nom in pages:
        sortie = ""
        code = ''
        result = requests.post(baseurl + 'api.php?action=query&titles=' + nom + '&export&exportnowrap')
        soup = BeautifulSoup(result.text, "lxml")
        for primitive in soup.findAll("text"):
            code += primitive.string
        entrees = code.split("*")
        entrees_avec_resultat = 0
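        # Each datafication entry is assumed to follow the usual Wikipast pattern
        # "* [[date]] / [[place]]. [[Hyperword]] de [[argument]] ...", which is taken apart below.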
        for entree in entrees:
            lieu = []
            date = []
            lieu_action = []  # initialized so that entries without "/" are skipped below instead of reusing a stale value
            # Find the date
            date_lieu_action = entree.split("/")
            date = date_lieu_action[0]
            date = date.replace("[[", "")
            date = date.replace("]]", "")
            date = ''.join(date.split())  # remove all whitespace (tabs, newlines, spaces...)
            if not(date) or not date[0].isdigit():  # if the first character is not a digit, this is probably not a date
                continue
            # Find the place
            if len(date_lieu_action) > 1:  # if there is more than just the date
                lieu_action = date_lieu_action[1]
                lieu_action = lieu_action.replace("[https:", "")
                lieu_action = lieu_action.replace("[http:", "")
                points = lieu_action.count('.')
                if lieu_action.endswith('.'):
                    if points > 1:
                        lieu_action = lieu_action.split('.')
                        lieu = lieu_action[0]
                        lieu = lieu.replace("[[", "")
                        lieu = lieu.replace("]]", "")
                        lieu = ''.join(lieu.split())  # remove all whitespace (tabs, newlines, spaces...)
                    else:
                        lieu = '-'
                        lieu_action = [lieu] + [lieu_action]
                else:
                    if points != 0:
                        lieu_action = lieu_action.split('.')
                        lieu = lieu_action[0]
                        lieu = lieu.replace("[[", "")
                        lieu = lieu.replace("]]", "")
                        lieu = ''.join(lieu.split())  # remove all whitespace (tabs, newlines, spaces...)
                    else:
                        lieu = '-'
                        lieu_action = [lieu] + [lieu_action]
            if len(lieu_action) < 2:
                continue
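            # At this point lieu_action[1] holds the text following the place
            # (or the whole segment when no place was found).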
            split_phase_1 = lieu_action[1].split("[[")
            actions = []
            for mot in split_phase_1:
                action = mot.split("]]")
                actions.append(action[0].strip())
            if "" in actions:
                actions.remove("")
            if len(actions) != 0:
                for hypermot in BDD:
                    if hypermot == actions[0]:
                        entrees_avec_resultat += 1
                        sortie = sortie + "_:e a " + BDD[hypermot][1] + ":" + BDD[hypermot][0] + "\n ; dc:date '" + date + "'\n ; bio:Place '" + lieu + "'\n"
                        for type in BDD[hypermot][2]:
                            indice = BDD[hypermot][2].index(type) + 1
                            try:
                                actions[indice]
                            except IndexError:
                                continue
                            sortie = sortie + " ; " + type + " '" + actions[indice] + "' \n"
                        sortie += "\n"
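        # For each recognized hyperword, sortie accumulates one pseudo-RDF block such as (hypothetical values):
        # _:e a bio:birth
        #  ; dc:date '1903.01.15'
        #  ; bio:Place 'Lausanne'
        #  ; bio:principal 'Jean Dupont'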
        # Creation of the corresponding RDF page
        if entrees_avec_resultat > 1:
            nouveau_nom = nom.replace('_', ' ') + " RDF"
            user = 'MasterBot'
            passw = 'dhbot2019'
            summary = 'Triplificator update'
            payload = {'action': 'query', 'format': 'json', 'utf8': '', 'meta': 'tokens', 'type': 'login'}
            r1 = requests.post(baseurl + 'api.php', data=payload)
            login_token = r1.json()['query']['tokens']['logintoken']
            payload = {'action': 'login', 'format': 'json', 'utf8': '', 'lgname': user, 'lgpassword': passw, 'lgtoken': login_token}
            r2 = requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)
            params3 = '?format=json&action=query&meta=tokens&continue='
            r3 = requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
            edit_token = r3.json()['query']['tokens']['csrftoken']
            edit_cookie = r2.cookies.copy()
            edit_cookie.update(r3.cookies)
            content = ''
            content += '<nowiki>\n'
            content += sortie
            content += '\n'
            content += 'Page originale: ' + nom.replace('_', ' ') + '\n'
            payload = {'action': 'edit', 'assert': 'user', 'format': 'json', 'utf8': '', 'text': content, 'summary': summary, 'title': nouveau_nom, 'token': edit_token}
            to_replace = 'Version RDF : ' + nouveau_nom + ''
            code = code.replace(to_replace, ' ')
            payload2 = {'action': 'edit', 'assert': 'user', 'format': 'json', 'utf8': '', 'text': code + '\n' + 'Version RDF : ' + nouveau_nom + '', 'summary': summary, 'title': nom, 'token': edit_token}
            # The first edit writes the RDF page, the second appends the "Version RDF" link to the original page
            r4 = requests.post(baseurl + 'api.php', data=payload, cookies=edit_cookie)
            r4 = requests.post(baseurl + 'api.php', data=payload2, cookies=edit_cookie)
def main(*args):
    if len(args) == 0:
        work(noms)
    else:
        work(args)
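# Minimal entry-point sketch, assuming the script is meant to be run directly (not stated on the original page):
# calling main() with no arguments processes every name extracted from the "Biographies" page.
if __name__ == '__main__':
    main()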