Code OrthoBot
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 25 15:33:15 2017

@author: bruno
"""
import time

import requests
from bs4 import BeautifulSoup

# Characters accepted as part of a French word.
letters = ['a', 'c', 'p', 'e', 'l', 'o', 'n', 't', 'r', 'i', 'f', 'g', 'j',
           'u', "'", 's', 'b', 'd', 'v', '-', 'm', 'z', 'è', 'é', 'q', 'â',
           'y', 'x', 'h', 'k', 'î', 'ê', 'û', 'ç', 'ë', 'ï', 'ô', 'ö', 'à',
           'w', 'ü', 'ñ', 'ù', 'ã', '.']

baseurl = 'http://wikipast.epfl.ch/wikipast/'
user = 'MasterBot'
passw = 'dhbot2019'
summary = 'Wikipastbot update'

# Login request: fetch a login token.
payload = {'action': 'query', 'format': 'json', 'utf8': '',
           'meta': 'tokens', 'type': 'login'}
r1 = requests.post(baseurl + 'api.php', data=payload)

# Login confirmation.
login_token = r1.json()['query']['tokens']['logintoken']
payload = {'action': 'login', 'format': 'json', 'utf8': '',
           'lgname': user, 'lgpassword': passw, 'lgtoken': login_token}
r2 = requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)

# Get the edit (CSRF) token.
params3 = '?format=json&action=query&meta=tokens&continue='
r3 = requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
edit_token = r3.json()['query']['tokens']['csrftoken']
edit_cookie = r2.cookies.copy()
edit_cookie.update(r3.cookies)


def read_french_words():
    """Read the dictionary txt file and store all the French words in a list."""
    data = requests.get("https://raw.githubusercontent.com/brunowicht/OrthoBot/master/french_words.txt").text
    data = data.split("\n")  # one word per line
    return data[:-1]  # drop the empty entry left by the trailing newline


def read_corrections():
    """Load the cached corrections (one 'word:prop1,prop2' entry per line)."""
    file = open('corrections.txt', 'r', encoding='utf8')
    cor = {}
    w = file.readline()
    while w:
        word = w.split(':')
        cor[word[0]] = word[1].split(',')
        w = file.readline()
    file.close()
    return cor


words = read_french_words()
corrections = read_corrections()


def write_corrections(t):
    """Append a new correction entry to the cache file."""
    file = open('corrections.txt', 'a', encoding='utf8')
    file.write(t)
    file.close()


def word_correct(w):
    """Return True if word w is correct, i.e. if it is in the dictionary."""
    w = w.lower()
    return w in words or w == ''
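
# Illustrative examples (hypothetical words; actual results depend on the
# contents of the downloaded french_words.txt):
#   word_correct('château')  -> True, provided 'château' is in the dictionary
#   word_correct('chateua')  -> False, so it becomes a correction candidate
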
def remove_bracketed(text, keep_hypermot=False):
    """Remove all the hypermots and links between brackets."""
    t = ''
    bracket = 0
    for c in text:
        if c == '[':
            bracket += 1
        if c == ']' and bracket > 0:
            bracket -= 1
        elif bracket == 0:
            t += c
        elif bracket == 2 and keep_hypermot:
            t += c
    return t


def remove_balised(text):
    """Remove all the markup tags between angle brackets."""
    t = ''
    b = 0
    for c in text:
        if c == '<':
            b += 1
        if c == '>' and b > 0:
            b -= 1
        elif b == 0:
            t += c
    return t


def remove_colored(text):
    # colour-span removing
    text = text.replace('<span style="color:red">', '<')
    text = text.replace('<span style="color:green">', '<')
    text = text.replace('</span>', '>')
    # nowiki removing (code)
    text = text.replace('<nowiki>', '<')
    text = text.replace('/nowiki>', '>')
    # table removing (a lot of names in there)
    text = text.replace('{|', '<')
    text = text.replace('|}', '>')
    return text


def keep_only_letters(text):
    """Return the same text with every character that is not a letter or a space replaced by a space."""
    t = ''
    for c in text:
        if c == '’':
            c = "'"
        if c.lower() in letters or c == ' ':
            t += c.lower()
        else:
            t += ' '
    return t + ' '


def text_to_words(text):
    """Transform a text into a list of words."""
    word = text.split(' ')
    ws = list()
    for w in word:
        if contains_letter(w):
            ws.append(w)
    return ws


def contains_letter(word):
    for c in word:
        if c.isalpha():
            return True
    return False


def eliminate_tirait_apostrophe(word_list):
    """Split words on apostrophes, dots and hyphens, unless the compound word is itself correct."""
    word = list()
    for w in word_list:
        if "'" in w:
            if word_correct(w):
                word.append(w)
            else:
                w1 = w.split("'")
                for w2 in w1:
                    if not w2 == '':
                        word.append(w2)
        else:
            word.append(w)
    word2 = list()
    for w in word:
        if '.' in w:
            if word_correct(w):
                word2.append(w)
            else:
                w1 = w.split('.')
                for w2 in w1:
                    if not w2 == '':
                        word2.append(w2)
        else:
            word2.append(w)
    final_word = list()
    for w in word2:
        if '-' in w:
            if word_correct(w):
                final_word.append(w)
            else:
                w1 = w.split('-')
                for w2 in w1:
                    if not w2 == '':
                        final_word.append(w2)
        else:
            final_word.append(w)
    return final_word


def text_wrong_words(text, hypermots=False):
    """Return the words of the page text that are not in the dictionary."""
    text_words = eliminate_tirait_apostrophe(
        text_to_words(
            keep_only_letters(
                remove_bracketed(remove_balised(remove_colored(text)), hypermots))))
    false_words = list()
    for w in text_words:
        if not word_correct(w):
            false_words.append(w)
    return false_words


def correction_proposition(word):
    """Propose corrections for an erroneous word."""
    if word_correct(word):
        return [word]
    cor = list()
    # 1 letter wrong (substitution)
    for i in range(len(word)):
        for l in letters:
            cword = ''
            cword += word[:i]
            cword += l
            if i < len(word) - 1:
                cword += word[i+1:]
            if word_correct(cword):
                cor.append(cword)
    # 1 letter missing (insertion)
    for i in range(len(word) + 1):
        for l in letters:
            cword = ''
            cword += word[:i]
            cword += l
            if i < len(word):
                cword += word[i:]
            if word_correct(cword):
                cor.append(cword)
    # 1 letter too many (deletion)
    for i in range(len(word)):
        cword = ''
        cword += word[:i]
        if i < len(word) - 1:
            cword += word[i+1:]
        if word_correct(cword):
            cor.append(cword)
    # special patterns: 'f' written instead of 'ph'
    for i in range(len(word)):
        if word[i] == 'f':
            cword = ''
            cword += word[:i]
            cword += 'ph'
            if i < len(word) - 1:
                cword += word[i+1:]
            if word_correct(cword):
                cor.append(cword)
    # make each correction unique
    return list(set(cor))


def corrections_for_words(word):
    """Compute and cache the propositions for every word not seen before."""
    unique_w = list(set(word))
    for w in unique_w:
        if w not in corrections:
            corrections[w] = correction_proposition(w)
            c = w
            c += ':'
            c += ','.join(corrections[w])
            c += '\n'
            write_corrections(c)


def print_correct_proposition_text(text):
    wrong = text_wrong_words(text, False)
    corrections_for_words(wrong)
    for w in wrong:
        print(w + '\ncorrection: ' + str(corrections[w]))
        print('')
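
# Illustrative example (hypothetical word; the actual propositions depend on
# the dictionary): each candidate is one substitution, insertion or deletion
# away from the input, or one 'f' -> 'ph' rewrite, e.g.
#   correction_proposition('filosophie')  -> ['philosophie'], if that is the
#   only dictionary entry reachable in a single edit
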
def get_text(page_name):
    """Fetch the raw wikitext of a page through the export API."""
    result = requests.post(baseurl + 'api.php?action=query&titles=' + page_name + '&export&exportnowrap')
    soup = BeautifulSoup(result.text, "lxml")
    text = ''
    for primitive in soup.findAll("text"):
        try:
            text += primitive.string
        except TypeError:
            continue
    return text


def correct_in_text(text):
    """Mark every misspelled word in red and append its propositions in green."""
    text = text + ' '
    wrong = list(set(text_wrong_words(text, False)))
    corrections_for_words(wrong)
    for w in wrong:
        i = text.find(w)
        l = len(w)
        while i != -1:
            # Only mark whole words, not substrings of longer words.
            if i > 0 and not text[i-1].isalpha() and not text[i+l].isalpha():
                text = (text[:i] + '<span style="color:red">' + text[i:i+l]
                        + '</span> (correction(s): <span style="color:green">'
                        + ', '.join(corrections[w]) + '</span>)' + text[i+l:])
            # Resume the search past the opening span tag just inserted.
            i = text.find(w, i + 25)
    return text


def getPageList():
    protected_logins = ["Frederickaplan", "Maud", "Vbuntinx", "Testbot", "IB", "SourceBot",
                        "PageUpdaterBot", "Orthobot", "BioPathBot", "ChronoBOT", "Amonbaro",
                        "AntoineL", "AntoniasBanderos", "Arnau", "Arnaudpannatier", "Aureliver",
                        "Brunowicht", "Burgerpop", "Cedricviaccoz", "Christophe", "Claudioloureiro",
                        "Ghislain", "Gregoire3245", "Hirtg", "Houssm", "Icebaker", "JenniCin",
                        "JiggyQ", "JulienB", "Kl", "Kperrard", "Leandro Kieliger", "Marcus",
                        "Martin", "MatteoGiorla", "Mireille", "Mj2905", "Musluoglucem", "Nacho",
                        "Nameless", "Nawel", "O'showa", "PA", "Qantik", "QuentinB",
                        "Raphael.barman", "Roblan11", "Romain Fournier", "Sbaaa", "Snus", "Sonia",
                        "Tboyer", "Thierry", "Titi", "Vlaedr", "Wanda"]
    pages = []
    for user in protected_logins:
        result = requests.post(baseurl + 'api.php?action=query&list=usercontribs&ucuser='
                               + user + '&format=xml&ucend=2017-02-02T16:00:00Z')
        soup = BeautifulSoup(result.content, 'lxml')
        for primitive in soup.usercontribs.findAll('item'):
            pages.append(primitive['title'])
    return list(set(pages))


def edit_page(page, text):
    payload = {'action': 'edit', 'assert': 'user', 'format': 'json', 'utf8': '',
               'text': text, 'summary': summary, 'title': page, 'token': edit_token}
    requests.post(baseurl + 'api.php', data=payload, cookies=edit_cookie)


def main(*names):
    t = time.time()
    if len(names) == 0:
        pages = getPageList()
    else:
        pages = names
    for p in pages:
        # Skip file pages and a few special pages.
        if not (p.find('Fichier') == 0 or p in ['Monsieur Y', 'Madame X', 'Biographies']):
            print(p)
            corrected = correct_in_text(get_text(p))
            #edit_page(p, corrected)
    print(time.time() - t)

# Open questions:
# What should be done with the hypermots?
# How should a correction be proposed to the user? Write the word in red
# and put its corrections on the discussion page.
# Should the site be "infected" with corrections?
# <span style="color:red">texte rouge</span> renders "texte rouge" in red in wiki syntax
# orthobot password: orthobot2017
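
As written, nothing calls main, and the edit_page(p, corrected) call inside it is commented out, so running the script only logs in, fetches pages, builds the correction cache and prints timings. Below is a minimal driver sketch, assuming an (initially empty) corrections.txt exists next to the script so that read_corrections can open it; the page title 'Henri Dunant' is a hypothetical example.

# Minimal driver sketch (assumption: 'Henri Dunant' is a hypothetical page
# title; pass no argument to sweep every page returned by getPageList()).
if __name__ == '__main__':
    main('Henri Dunant')   # dry run on a single page: nothing is written back
    # main()               # full sweep of the protected users' pages

To actually push the highlighted text back to the wiki, the edit_page(p, corrected) line in main has to be uncommented.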