# Code OrthoBot
# -*- coding: utf-8 -*-
""" Created on Tue Apr 25 15:33:15 2017
@author: bruno """
import time

import requests
from bs4 import BeautifulSoup
# Characters accepted inside a word: lowercase Latin letters (including the
# accented forms listed here), the apostrophe, the hyphen and the full stop.
# The order of the original list is preserved.
letters = list("acpelontrifgju'sbdv-mzèéqâyxhkîêûçëïôöàwüñùã.")
# Wiki connection settings.
# NOTE(review): baseurl is empty in this copy of the source. Every request is
# built as baseurl + 'api.php', so it must be set to the wiki base URL
# (ending with '/') before the bot can run.
baseurl = ''
# SECURITY: credentials are hard-coded in the source; prefer loading them from
# the environment or from an untracked configuration file.
user = 'MasterBot'
passw = 'dhbot2019'
# Edit summary sent with every page modification.
summary = 'Wikipastbot update'
# Login request: ask the API for a login token.
# NOTE(review): the "r1 = requests.post(baseurl" and "r2 = requests.post(baseurl"
# prefixes were missing from the damaged source. They are reconstructed from
# the surviving arguments (data=payload) and from the later uses of r1 and r2.
payload = {'action': 'query', 'format': 'json', 'utf8': '',
           'meta': 'tokens', 'type': 'login'}
r1 = requests.post(baseurl + 'api.php', data=payload)

# Login confirm: send the credentials together with the login token.
login_token = r1.json()['query']['tokens']['logintoken']
payload = {'action': 'login', 'format': 'json', 'utf8': '',
           'lgname': user, 'lgpassword': passw, 'lgtoken': login_token}
r2 = requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)

# Get the edit (CSRF) token using the logged-in session cookies.
params3 = '?format=json&action=query&meta=tokens&continue='
r3 = requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
edit_token = r3.json()['query']['tokens']['csrftoken']

# Cookies sent with edit requests: the login cookies updated with those
# returned by the token request.
edit_cookie = r2.cookies.copy()
edit_cookie.update(r3.cookies)
def read_french_words():
    """Download the dictionary text and return its lines as a list of words.

    Returns:
        list of str, one entry per line of the downloaded text.
    """
    # NOTE(review): the dictionary URL is empty in this copy of the source;
    # the request cannot succeed until the URL is restored.
    text = requests.get("").text
    lines = text.split("\n")
    # A final newline leaves one empty string at the end of the split. Drop
    # only that one, so a file without a final newline keeps its last word.
    if lines[-1] == "":
        lines.pop()
    return lines
def read_corrections():
    """Load the cached corrections from ``corrections.txt``.

    Each line of the file has the form ``word:cor1,cor2,...``, as produced
    by write_corrections().

    Returns:
        dict mapping each word to the list of its proposed corrections (an
        empty list when the line carries none). An empty dict is returned
        when the file does not exist yet.
    """
    try:
        file = open('corrections.txt', 'r', encoding='utf8')
    except FileNotFoundError:
        # Nothing has been cached yet (the file is only created on append).
        return {}
    cor = {}
    with file:
        for line in file:
            # Strip the line terminator so that it does not end up glued to
            # the last correction.
            word, sep, proposals = line.rstrip('\n').partition(':')
            if not sep:
                continue  # blank or malformed line: no 'word:' prefix
            cor[word] = proposals.split(',') if proposals else []
    return cor
# Known words, held in a set: word_correct() runs a membership test for every
# candidate spelling, which is far too slow against a list.
words = set(read_french_words())
# Cache of already computed corrections, keyed by the misspelled word.
corrections = read_corrections()
def write_corrections(t):
    """Append the text *t* to ``corrections.txt``.

    The file is created when missing and is closed even if the write fails.
    """
    with open('corrections.txt', 'a', encoding='utf8') as file:
        file.write(t)
def word_correct(w):
    """Return True if word *w* is correct, i.e. if it is in the dictionary.

    The lookup is case-insensitive: *w* is lowercased first. The empty
    string is also reported as correct.
    """
    w = w.lower()
    # NOTE(review): the right-hand operand of the comparison was lost in the
    # damaged source; the empty-string literal is restored here - confirm.
    return w in words or w == ''
def remove_bracketed(text, keep_hypermot=False):
    """Remove the bracketed parts (links and hyperwords) from *text*.

    Args:
        text: the text to filter.
        keep_hypermot: when True, characters found at exactly two levels of
            bracket nesting (``[[...]]``) are kept, without their brackets.

    Returns:
        The filtered text. A ']' with no matching '[' is left in place.
    """
    kept = []
    depth = 0
    for c in text:
        if c == '[':
            depth += 1
            # The bracket itself is never part of the output, including the
            # second '[' of '[[' when hyperwords are kept.
            continue
        if c == ']' and depth > 0:
            depth -= 1
        elif depth == 0 or (depth == 2 and keep_hypermot):
            kept.append(c)
    return ''.join(kept)
def remove_balised(text):
    """Remove every tag from *text*, i.e. anything between '<' and '>'.

    Nested '<' are counted, so output resumes only once every opened '<'
    has been closed. A '>' met outside any tag is kept as is.
    """
    kept = []
    depth = 0
    for c in text:
        if c == '<':
            depth += 1
        if c == '>' and depth > 0:
            depth -= 1
        elif depth == 0:
            kept.append(c)
    return ''.join(kept)
def remove_colored(text):
    """Rewrite coloured spans, nowiki sections and tables as pseudo-tags.

    The opening marker of each region becomes '<' and its closing marker
    becomes '>', so that a later tag-stripping pass drops the whole region
    together with its content.
    """
    # Colour removing.
    # NOTE(review): the three literals of this section were lost in the
    # damaged source (they read as empty strings, which would insert '<' and
    # '>' between every character). The red span and its closing tag are
    # restored from the wiki colour markup the trailing notes of this file
    # refer to; the opening tag is 24 characters long, consistent with the
    # skip of 25 used in correct_in_text - confirm.
    # NOTE(review): the original had a second opening literal that could not
    # be recovered; add it here once it is known.
    text = text.replace('<span style="color:red">', '<')
    text = text.replace('</span>', '>')
    # No-wiki removing (code). The closing marker includes its '<':
    # replacing only '/nowiki>' would leave a stray '<' that opens a
    # pseudo-tag which is never closed.
    text = text.replace('<nowiki>', '<')
    text = text.replace('</nowiki>', '>')
    # Table removing (a lot of names in there).
    text = text.replace('{|', '<')
    text = text.replace('|}', '>')
    return text
def keep_only_letters(text):
    """Return *text* lowercased, keeping only letters and spaces.

    Every character whose lowercase form is not in ``letters`` and that is
    not a space is replaced by a space. The typographic apostrophe is first
    normalised to "'". One trailing space is appended to the result.
    """
    out = []
    for c in text:
        if c == '’':
            c = "'"
        low = c.lower()
        out.append(low if low in letters or c == ' ' else ' ')
    return ''.join(out) + ' '
def text_to_words(text):
    """Transform a text into a list of words.

    The text is split on single spaces; only the pieces that contain at
    least one letter are kept.
    """
    return [w for w in text.split(' ') if contains_letter(w)]
def contains_letter(word):
    """Return True if at least one character of *word* is alphabetic."""
    return any(c.isalpha() for c in word)
def _split_unless_correct(word_list, sep):
    """Split on *sep* every word that contains it and is not a dictionary word."""
    result = []
    for w in word_list:
        if sep in w and not word_correct(w):
            # Empty pieces (leading, trailing or doubled separator) are dropped.
            result.extend(piece for piece in w.split(sep) if piece != '')
        else:
            result.append(w)
    return result


def eliminate_tirait_apostrophe(word_list):
    """Split words on apostrophes, full stops and hyphens.

    The three separators are handled one after the other, in that order. A
    word containing a separator is kept whole when it is itself a dictionary
    word; otherwise it is replaced by its non-empty pieces.

    Returns:
        A new list of words.
    """
    # The original appended correct hyphenated words to the wrong list in the
    # hyphen pass, which dropped them from the result; a single shared helper
    # removes that copy/paste error.
    for sep in ("'", '.', '-'):
        word_list = _split_unless_correct(word_list, sep)
    return word_list
def text_wrong_words(text, hypermots=False):
    """Return the words of *text* that are not in the dictionary.

    Coloured/nowiki/table regions, tags and bracketed parts are removed
    before the text is reduced to letters and split into words.

    Args:
        text: the raw page text.
        hypermots: passed to remove_bracketed() as ``keep_hypermot``.

    Returns:
        list of wrong words, in order of appearance, duplicates included.
    """
    cleaned = remove_colored(text)
    cleaned = remove_balised(cleaned)
    cleaned = remove_bracketed(cleaned, hypermots)
    cleaned = keep_only_letters(cleaned)
    candidates = eliminate_tirait_apostrophe(text_to_words(cleaned))
    return [w for w in candidates if not word_correct(w)]
def correction_proposition(word):
    """Propose corrections for an erroneous word.

    Candidates are built by changing one letter, inserting one letter,
    removing one letter, or replacing an 'f' by 'ph'; those accepted by
    word_correct() are kept.

    Returns:
        ``[word]`` when *word* is already correct, otherwise the sorted list
        of distinct corrections (possibly empty).
    """
    if word_correct(word):
        return [word]
    n = len(word)
    # A set removes duplicate candidates before the dictionary is consulted.
    candidates = set()
    # 1 letter wrong
    for i in range(n):
        for l in letters:
            candidates.add(word[:i] + l + word[i + 1:])
    # 1 letter missing
    for i in range(n + 1):
        for l in letters:
            candidates.add(word[:i] + l + word[i:])
    # 1 letter too many
    for i in range(n):
        candidates.add(word[:i] + word[i + 1:])
    # special patterns: 'f' written instead of 'ph'
    for i, c in enumerate(word):
        if c == 'f':
            candidates.add(word[:i] + 'ph' + word[i + 1:])
    # Removing the only letter of a word yields an empty candidate, which is
    # never a useful correction.
    candidates.discard('')
    # Sorted so that the result (and the cache file) is deterministic.
    return sorted(c for c in candidates if word_correct(c))
def corrections_for_words(word):
    """Make sure every word of the iterable *word* has cached corrections.

    For each distinct word not yet in ``corrections``, the corrections are
    computed, stored in ``corrections`` and appended to the corrections
    file as one ``word:cor1,cor2,...`` line.
    """
    for w in set(word):
        if w in corrections:
            continue
        corrections[w] = correction_proposition(w)
        write_corrections(w + ':' + ','.join(corrections[w]) + '\n')
def print_correct_proposition_text(text):
    """Print every wrong word of *text* followed by its proposed corrections."""
    wrong = text_wrong_words(text, False)
    corrections_for_words(wrong)
    for w in wrong:
        print(w + '\ncorrection: ' + str(corrections[w]))
        # Blank line between two entries.
        print()
def get_text(page_name):
    """Fetch the exported content of the page *page_name*.

    Returns:
        The concatenation of the string content of every ``text`` element of
        the export; elements without a single string content are skipped.
    """
    # NOTE(review): the request call was missing from the damaged source;
    # only the query string and the later use of `result` survived. The call
    # is reconstructed, with the HTTP verb assumed - confirm. The query is
    # passed as `params` so that the title is URL-encoded.
    params = {'action': 'query', 'titles': page_name,
              'export': '', 'exportnowrap': ''}
    result = requests.get(baseurl + 'api.php', params=params)
    soup = BeautifulSoup(result.text, "lxml")
    parts = []
    for primitive in soup.findAll("text"):
        # .string is None when the element has no single string child; the
        # original skipped that case through a bare except.
        if primitive.string is not None:
            parts.append(primitive.string)
    return ''.join(parts)
def correct_in_text(text):
    """Mark every wrong word of *text* and append its proposed corrections.

    Each standalone occurrence of a wrong word (not adjacent to a letter on
    either side) is replaced by the word followed by
    `` (correction(s): a, b)``, wrapped in the marker tags below.

    Returns:
        The annotated text, with one trailing space appended.
    """
    # NOTE(review): the marker literals were lost in the damaged source (only
    # an empty string survived before the word). The red span is restored
    # from the colour markup this file's trailing notes refer to and from the
    # original skip of 25 characters (the opening tag is 24 long). The
    # position of the closing tag is an assumption - confirm.
    open_tag = '<span style="color:red">'
    close_tag = '</span>'
    # Sentinel space: guarantees text[i + l] exists for every match.
    text = text + ' '
    wrong = list(set(text_wrong_words(text, False)))
    corrections_for_words(wrong)
    for w in wrong:
        l = len(w)
        i = text.find(w)
        while i != -1:
            # A match at index 0 has no preceding character and counts as a
            # word start (the original never marked it).
            starts_word = i == 0 or not text[i - 1].isalpha()
            if starts_word and not text[i + l].isalpha():
                marked = (open_tag + w + ' (correction(s): '
                          + ', '.join(corrections[w]) + ')' + close_tag)
                text = text[:i] + marked + text[i + l:]
                # Resume after the inserted annotation.
                i = text.find(w, i + len(marked))
            else:
                # Nothing inserted: move on by one character only, so that
                # nearby occurrences are not skipped.
                i = text.find(w, i + 1)
    return text
def getPageList():
    """Return the titles of the pages found in the listed users' contributions.

    One ``usercontribs`` query (with ``ucend=2017-02-02T16:00:00Z``) is sent
    per login of ``protected_logins``.

    Returns:
        list of distinct page titles, in no particular order.
    """
    protected_logins = ["Frederickaplan", "Maud", "Vbuntinx", "Testbot", "IB", "SourceBot", "PageUpdaterBot", "Orthobot", "BioPathBot", "ChronoBOT", "Amonbaro", "AntoineL", "AntoniasBanderos", "Arnau", "Arnaudpannatier", "Aureliver", "Brunowicht", "Burgerpop", "Cedricviaccoz", "Christophe", "Claudioloureiro", "Ghislain", "Gregoire3245", "Hirtg", "Houssm", "Icebaker", "JenniCin", "JiggyQ", "JulienB", "Kl", "Kperrard", "Leandro Kieliger", "Marcus", "Martin", "MatteoGiorla", "Mireille", "Mj2905", "Musluoglucem", "Nacho", "Nameless", "Nawel", "O'showa", "PA", "Qantik", "QuentinB", "Raphael.barman", "Roblan11", "Romain Fournier", "Sbaaa", "Snus", "Sonia", "Tboyer", "Thierry", "Titi", "Vlaedr", "Wanda"]
    pages = set()
    for login in protected_logins:
        # NOTE(review): the request call was missing from the damaged source;
        # it is reconstructed, with the HTTP verb assumed - confirm. The query
        # is passed as `params` so that logins containing spaces or
        # apostrophes are URL-encoded.
        # NOTE(review): no 'uclimit' is sent, so presumably only the API's
        # default number of contributions is returned per user - verify.
        params = {'action': 'query', 'list': 'usercontribs', 'ucuser': login,
                  'format': 'xml', 'ucend': '2017-02-02T16:00:00Z'}
        result = requests.get(baseurl + 'api.php', params=params)
        soup = BeautifulSoup(result.content, 'lxml')
        contribs = soup.usercontribs
        if contribs is None:
            # No <usercontribs> element in the answer: nothing to collect.
            continue
        for primitive in contribs.findAll('item'):
            pages.add(primitive['title'])
    return list(pages)
def edit_page(page, text):
    """Send an edit request setting the content of *page* to *text*.

    The request carries the module-level edit token, edit summary and
    session cookies.
    """
    payload = {'action': 'edit', 'assert': 'user', 'format': 'json',
               'utf8': '', 'text': text, 'summary': summary,
               'title': page, 'token': edit_token}
    # NOTE(review): the "requests.post(baseurl +" prefix was missing from the
    # damaged source and is reconstructed from the surviving arguments.
    requests.post(baseurl + 'api.php', data=payload, cookies=edit_cookie)
def main(*names):
    """Spell-check the given pages, or every page of getPageList() if none.

    Pages whose title starts with 'Fichier' and a few fixed titles are
    skipped. The title of each processed page is printed, and the elapsed
    time in seconds is printed at the end.
    """
    t = time.time()
    pages = names if names else getPageList()
    for p in pages:
        if p.startswith('Fichier') or p in ('Monsieur Y', 'Madame X', 'Biographies'):
            continue
        print(p)
        corrected = correct_in_text(get_text(p))
        # Publishing is disabled: the corrected text is computed, not saved.
        #edit_page(p, corrected)
    print(time.time() - t)
# Questions to ask:
# What should be done with the hyperwords (hypermots)?
# How should a correction be proposed to the user? Write the word in red
# and put its corrections on the discussion page.
# Should we "infect" the site?
# <span style="color:red">red text</span> gives "red text" in red in the wiki syntax
#password orthobot: orthobot2017