Code OrthoBot
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 25 15:33:15 2017
@author: bruno
"""
import time
import requests
from bs4 import BeautifulSoup
letters = ['a', 'c', 'p', 'e', 'l', 'o', 'n', 't', 'r', 'i', 'f', 'g',
'j', 'u', "'", 's', 'b', 'd', 'v', '-', 'm', 'z', 'è', 'é', 'q', 'â',
'y', 'x', 'h', 'k', 'î', 'ê', 'û', 'ç', 'ë', 'ï', 'ô', 'ö', 'à', 'w',
'ü', 'ñ', 'ù', 'ã', '.']
baseurl='http://wikipast.epfl.ch/wikipast/'
user = 'MasterBot'
passw = 'dhbot2019'
summary='Wikipastbot update'
# Step 1: fetch a login token
payload={'action':'query','format':'json','utf8':'','meta':'tokens','type':'login'}
r1=requests.post(baseurl + 'api.php', data=payload)
# Step 2: log in with the login token
login_token=r1.json()['query']['tokens']['logintoken']
payload={'action':'login','format':'json','utf8':'','lgname':user,'lgpassword':passw,'lgtoken':login_token}
r2=requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)
# Step 3: fetch a CSRF (edit) token and keep the session cookies for later edits
params3='?format=json&action=query&meta=tokens&continue='
r3=requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
edit_token=r3.json()['query']['tokens']['csrftoken']
edit_cookie=r2.cookies.copy()
edit_cookie.update(r3.cookies)
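# Added sketch (not in the original bot): a minimal sanity check of the login,
# assuming the standard MediaWiki API response format for action=login.
login_result = r2.json().get('login', {}).get('result')
if login_result != 'Success':
    print('Warning: MediaWiki login did not return Success:', login_result)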
def read_french_words():
    """Download the dictionary txt file and store all the French words in a list."""
    data = requests.get("https://raw.githubusercontent.com/brunowicht/OrthoBot/master/french_words.txt").text
    data = data.split("\n")  # one word per line
    return data[:-1]  # drop the empty entry left by the trailing newline
def read_corrections():
    """Load previously computed corrections from corrections.txt (format: word:cand1,cand2,...)."""
    file = open('corrections.txt', 'r', encoding='utf8')
    cor = {}
    w = file.readline()
    while w:
        word = w.rstrip('\n').split(':')
        cor[word[0]] = word[1].split(',')
        w = file.readline()
    file.close()
    return cor
words = read_french_words()
corrections = read_corrections()
def write_corrections(t):
    """Append newly computed corrections to corrections.txt."""
    file = open('corrections.txt', 'a', encoding='utf8')
    file.write(t)
    file.close()
def word_correct(w):
    """Returns True if word w is correct i.e. if it is in the dictionary"""
    w = w.lower()
    return w in words or w == ''
def remove_bracketed(text, keep_hypermot = False):
    """Remove everything between brackets (links); optionally keep the text inside [[double brackets]] (hypermots)."""
    t = ''
    bracket = 0
    for c in text:
        if c == '[':
            bracket += 1
        if c == ']' and bracket > 0:
            bracket -= 1
        elif bracket == 0:
            t += c
        elif bracket == 2 and keep_hypermot:
            t += c
    return t
def remove_balised(text):
    """Remove everything between < and > tags (balises)."""
    t = ''
    b = 0
    for c in text:
        if c == '<':
            b += 1
        if c == '>' and b > 0:
            b -= 1
        elif b == 0:
            t += c
    return t
def remove_colored(text):
    """Turn colored spans, nowiki blocks and tables into <...> groups so that remove_balised drops their content."""
    # colored spans (previous bot annotations)
    text = text.replace('<span style="color:red">', '<')
    text = text.replace('<span style="color:green">', '<')
    text = text.replace('</span>', '>')
    # nowiki blocks (code)
    text = text.replace('<nowiki>', '<')
    text = text.replace('/nowiki>', '>')
    # tables (a lot of proper names in there)
    text = text.replace('{|', '<')
    text = text.replace('|}', '>')
    return text
def keep_only_letters(text):
    """Lowercase the text and replace every character that is not in the letters list (or a space) by a space."""
    t = ''
    for c in text:
        if c == '’':
            c = "'"
        if c.lower() in letters or c == ' ':
            t += c.lower()
        else:
            t += ' '
    return t+' '
def text_to_words(text):
    """Transform a text into a list of words."""
    word = text.split(' ')
    ws = list()
    for w in word:
        if contains_letter(w):
            ws.append(w)
    return ws
def contains_letter(word):
    """Return True if the word contains at least one alphabetic character."""
    for c in word:
        if c.isalpha():
            return True
    return False
def eliminate_tirait_apostrophe(word_list):
    """Split words on apostrophes, dots and hyphens when the joined form is not in the dictionary."""
    word = list()
    for w in word_list:
        if "'" in w:
            if word_correct(w):
                word.append(w)
            else:
                w1 = w.split("'")
                for w2 in w1:
                    if not w2 == '':
                        word.append(w2)
        else:
            word.append(w)
    word2 = list()
    for w in word:
        if '.' in w:
            if word_correct(w):
                word2.append(w)
            else:
                w1 = w.split('.')
                for w2 in w1:
                    if not w2 == '':
                        word2.append(w2)
        else:
            word2.append(w)
    final_word = list()
    for w in word2:
        if "-" in w:
            if word_correct(w):
                final_word.append(w)  # keep the correct hyphenated word as a whole
            else:
                w1 = w.split("-")
                for w2 in w1:
                    if not w2 == '':
                        final_word.append(w2)
        else:
            final_word.append(w)
    return final_word
def text_wrong_words(text, hypermots=False):
    """Return the words of the cleaned text that are not found in the dictionary."""
    text_words = eliminate_tirait_apostrophe(text_to_words(keep_only_letters(remove_bracketed(remove_balised(remove_colored(text)), hypermots))))
    false_words = list()
    for w in text_words:
        if not word_correct(w):
            false_words.append(w)
    return false_words
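# Illustrative sketch (added, not part of the original bot): how the cleaning
# pipeline above behaves on a small wiki-formatted string. The sample text and
# the name demo_pipeline are hypothetical; the exact wrong words depend on the
# downloaded dictionary.
def demo_pipeline():
    sample = "[[1900]] / [[Lausanne]]. Nécrologie de la vilage."
    # cleaned, lowercased version of the sample (brackets and tags stripped)
    print(keep_only_letters(remove_bracketed(remove_balised(remove_colored(sample)), False)))
    # words absent from the dictionary, most likely only 'vilage' here
    print(text_wrong_words(sample))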
def correction_proposition(word):
    """Propose candidate corrections for an erroneous word (one letter substituted, inserted or removed, plus the f/ph confusion)."""
    if word_correct(word):
        return [word]
    cor = list()
    ## 1 letter wrong (substitution)
    for i in range(len(word)):
        for l in letters:
            cword = ''
            cword += word[:i]
            cword += l
            if i < len(word)-1:
                cword += word[i+1:]
            if word_correct(cword):
                cor.append(cword)
    ## 1 letter missing (insertion)
    for i in range(len(word)+1):
        for l in letters:
            cword = ''
            cword += word[:i]
            cword += l
            if i < len(word):
                cword += word[i:]
            if word_correct(cword):
                cor.append(cword)
    ## 1 letter too many (deletion)
    for i in range(len(word)):
        cword = ''
        cword += word[:i]
        if i < len(word)-1:
            cword += word[i+1:]
        if word_correct(cword):
            cor.append(cword)
    ## special patterns: 'f' written instead of 'ph'
    for i in range(len(word)):
        if word[i] == 'f':
            cword = ''
            cword += word[:i]
            cword += 'ph'
            if i < len(word)-1:
                cword += word[i+1:]
            if word_correct(cword):
                cor.append(cword)
    ## make each correction unique
    return list(set(cor))
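# Illustrative sketch (added): candidate corrections for one misspelled word.
# The exact result depends on the downloaded dictionary; for 'vilage' the
# substitution and insertion passes would typically yield 'visage' and 'village'.
def demo_correction():
    print(correction_proposition('vilage'))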
def corrections_for_words(word):
    """Compute and cache corrections for each unique wrong word, persisting new ones to corrections.txt."""
    unique_w = list(set(word))
    for w in unique_w:
        if w not in corrections:
            corrections[w] = correction_proposition(w)
            c = w
            c += ':'
            c += ','.join(corrections[w])
            c += '\n'
            write_corrections(c)
def print_correct_proposition_text(text):
    """Print every wrong word of the text together with its proposed corrections."""
    wrong = text_wrong_words(text, False)
    corrections_for_words(wrong)
    for w in wrong:
        print(w + '\ncorrection: ' + str(corrections[w]))
        print('')
def get_text(page_name):
    """Download the raw wikitext of a page through the export API."""
    result=requests.post(baseurl+'api.php?action=query&titles='+page_name+'&export&exportnowrap')
    soup=BeautifulSoup(result.text, "lxml")
    text=''
    for primitive in soup.findAll("text"):
        try:
            text+=primitive.string
        except:  # primitive.string can be None
            continue
    return text
def correct_in_text(text):
    """Wrap every wrong word of the text in a red span followed by its green correction candidates."""
    text = text+' '
    wrong = list(set(text_wrong_words(text, False)))
    corrections_for_words(wrong)
    for w in wrong:
        i = text.find(w)
        l = len(w)
        while i in range(0, len(text)):
            # only annotate whole words, never substrings of longer words
            if i > 0 and not text[i-1].isalpha() and not text[i+l].isalpha():
                text = text[:i]+ '<span style="color:red">' +text[i:i + l] + '</span> (correction(s): <span style="color:green">' +', '.join(corrections[w])+'</span>)'+ text[i + l:]
            i = text.find(w, i+25)
    return text
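# Note (added, not in the original): for a page containing the unknown word
# 'vilage', correct_in_text() produces wiki markup of roughly this shape
# (the candidate list depends on the dictionary):
#   <span style="color:red">vilage</span> (correction(s): <span style="color:green">village, visage</span>)
# The corrected text is computed in main() below, but the edit_page() call
# that would save it back to the wiki is commented out there.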
def getPageList():
    """List the pages edited by the known contributors since the start of the project."""
    protected_logins=["Frederickaplan","Maud","Vbuntinx","Testbot","IB","SourceBot","PageUpdaterBot","Orthobot","BioPathBot","ChronoBOT","Amonbaro","AntoineL","AntoniasBanderos","Arnau","Arnaudpannatier","Aureliver","Brunowicht","Burgerpop","Cedricviaccoz","Christophe","Claudioloureiro","Ghislain","Gregoire3245","Hirtg","Houssm","Icebaker","JenniCin","JiggyQ","JulienB","Kl","Kperrard","Leandro Kieliger","Marcus","Martin","MatteoGiorla","Mireille","Mj2905","Musluoglucem","Nacho","Nameless","Nawel","O'showa","PA","Qantik","QuentinB","Raphael.barman","Roblan11","Romain Fournier","Sbaaa","Snus","Sonia","Tboyer","Thierry","Titi","Vlaedr","Wanda"]
    pages=[]
    for user in protected_logins:
        result=requests.post(baseurl+'api.php?action=query&list=usercontribs&ucuser='+user+'&format=xml&ucend=2017-02-02T16:00:00Z')
        soup=BeautifulSoup(result.content,'lxml')
        for primitive in soup.usercontribs.findAll('item'):
            pages.append(primitive['title'])
    return list(set(pages))
def edit_page(page, text):
    """Save the given text to a wiki page through the edit API."""
    payload = {'action':'edit','assert':'user','format':'json','utf8':'','text': text,'summary':summary,'title':page,'token':edit_token}
    requests.post(baseurl+'api.php',data=payload,cookies=edit_cookie)
def main(*names):
    """Run the corrector on the given page titles, or on all contributor pages when no title is given."""
    t = time.time()
    pages = ""
    if len(names) == 0:
        pages = getPageList()
    else:
        pages = names
    for p in pages:
        if not (p.find('Fichier') == 0 or p in ['Monsieur Y', 'Madame X', 'Biographies']):
            print(p)
            corrected = correct_in_text(get_text(p))
            #edit_page(p, corrected)  # writing back to the wiki is deliberately disabled
    print(time.time() - t)
#Questions to ask:
#What should we do with the hypermots?
#How should a correction be proposed to the user? Write the word in red
#and put its corrections in the discussion page
#Should we "infect" the site?
#<span style="color:red">texte rouge</span> renders "texte rouge" in red in wiki syntax
#orthobot password: orthobot2017
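# Usage sketch (added, not in the original page): run the bot on a single page
# without writing anything back to the wiki. 'Lausanne' is only an example title.
if __name__ == '__main__':
    main('Lausanne')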