Code OrthoBot

<nowiki>
# -*- coding: utf-8 -*-

""" Created on Tue Apr 25 15:33:15 2017

@author: bruno """

import time
import requests
from bs4 import BeautifulSoup

letters = ['a', 'c', 'p', 'e', 'l', 'o', 'n', 't', 'r', 'i', 'f', 'g',
           'j', 'u', "'", 's', 'b', 'd', 'v', '-', 'm', 'z', 'è', 'é', 'q', 'â',
           'y', 'x', 'h', 'k', 'î', 'ê', 'û', 'ç', 'ë', 'ï', 'ô', 'ö', 'à', 'w',
           'ü', 'ñ', 'ù', 'ã', '.']

baseurl = 'http://wikipast.epfl.ch/wikipast/'
user = 'MasterBot'
passw = 'dhbot2019'
summary = 'Wikipastbot update'

# Login request

payload = {'action':'query','format':'json','utf8':'','meta':'tokens','type':'login'}
r1 = requests.post(baseurl + 'api.php', data=payload)

# login confirm

login_token = r1.json()['query']['tokens']['logintoken']
payload = {'action':'login','format':'json','utf8':'','lgname':user,'lgpassword':passw,'lgtoken':login_token}
r2 = requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)

# get edit token

params3 = '?format=json&action=query&meta=tokens&continue='
r3 = requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
edit_token = r3.json()['query']['tokens']['csrftoken']

edit_cookie = r2.cookies.copy()
edit_cookie.update(r3.cookies)
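
# The three requests above follow the standard MediaWiki API flow: fetch a login
# token, log in with it, then fetch a CSRF ('edit') token. The cookies from the
# login and token responses are merged so later edit requests stay authenticated.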


def read_french_words():

   """read the dictionary txt file and store all the french words in a list"""
   data = requests.get("https://raw.githubusercontent.com/brunowicht/OrthoBot/master/french_words.txt").text # read only 20 000 chars
   data = data.split("\n") # then split it into lines
   
   return data[:-1]

def read_corrections():

   """Read the corrections.txt cache into a dict: word -> list of proposed corrections."""
   file = open('corrections.txt', 'r', encoding='utf8')
   cor = {}
   w = file.readline()
   while w:
       word = w.split(':')
       cor[word[0]] = word[1].strip().split(',')  # strip the trailing newline
       w = file.readline()
   file.close()
   return cor
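
# Format of the corrections.txt cache (one entry per line, written by
# write_corrections below): "word:candidate1,candidate2,...". A hypothetical line
#   bonjuor:bonjour
# is read back by read_corrections() as {'bonjuor': ['bonjour']}.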

words = read_french_words()
corrections = read_corrections()


def write_corrections(t):

   """Append a correction entry to the corrections.txt cache file."""
   file = open('corrections.txt', 'a', encoding='utf8')
   file.write(t)
   file.close()
   
   

def word_correct(w):

   """Returns True if word w is correct i.e. if it is in the dictionary"""
   w = w.lower()
   return w in words or w == 

def remove_bracketed(text, keep_hypermot = False):

   """remove all the hypermots and links between brackets"""
   t = ''
   bracket = 0
   for c in text:
       if c == '[':
           bracket += 1
       if c == ']' and bracket > 0:
           bracket -= 1
       elif bracket == 0:
           t += c
       elif bracket == 2 and keep_hypermot:
           t += c
   return t

def remove_balised(text):

   """remove all the balises"""
   t = 
   b = 0
   for c in text:
       if c == '<':
           b += 1
       if c == '>' and b > 0:
           b -= 1
       elif b == 0:
           t += c
   return t


def remove_colored(text):

   # color removing (the original colour-markup strings were lost in the wiki copy
   # and are left here as empty placeholders)
   text = text.replace('', '<')
   text = text.replace('', '<')
   text = text.replace('', '>')
   # nowiki removing (code)
   text = text.replace('<nowiki>', '<')
   text = text.replace('/nowiki>', '>')
   #table removing (a lot of names in there)
   text = text.replace('{|', '<')
   text = text.replace('|}', '>')
   return text
           

def keep_only_letters(text):

   """return the same text removing all character which is not a letter or a space."""
   t = 
   for c in text:
       if c == '’':
           c = "'"
       if c.lower() in letters or c == ' ':
           t += c.lower()
       else:
           t += ' '
   return t+' '

def text_to_words(text):

   """transforms a text into a list of words"""
   word = text.split(' ')
   ws = list()
   for w in word:
       if contains_letter(w):
           ws.append(w)
   return ws

def contains_letter(word):

   """Return True if word contains at least one alphabetic character."""
   for c in word:
       if c.isalpha():
           return True
   return False
           

def eliminate_tirait_apostrophe(word_list):

   """Split words joined by apostrophes, periods or hyphens when the compound itself is not in the dictionary."""
   word = list()
   for w in word_list:
       if "'" in w:
           if word_correct(w):
               word.append(w)
           else:
               w1 = w.split("'")
               for w2 in w1:
                   if not w2 == '':
                       word.append(w2)
       else:
           word.append(w)
   word2 = list()
   for w in word:
       if '.' in w:
           if word_correct(w):
               word2.append(w)
           else:
               w1 = w.split('.')
               for w2 in w1:
                   if not w2 == '':
                       word2.append(w2)
       else:
           word2.append(w)

   final_word = list()
   for w in word2:
       if "-" in w:
           if word_correct(w):
               final_word.append(w)
           else:
               w1 = w.split("-")
               for w2 in w1:
                   if not w2 == '':
                       final_word.append(w2)
       else:
           final_word.append(w)
   return final_word
               

def text_wrong_words(text, hypermots=False):

   """Return the list of words in text that are not found in the dictionary."""
   text_words = eliminate_tirait_apostrophe(
       text_to_words(keep_only_letters(remove_bracketed(remove_balised(remove_colored(text)), hypermots))))
   false_words = list()
   for w in text_words:
       if not word_correct(w):
           false_words.append(w)
   return false_words
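
# Usage sketch (hypothetical input; the exact output depends on the downloaded
# dictionary): markup is stripped before spell checking, so a call such as
#   text_wrong_words("Ceci est un [[lien]] avec une ptite faute")
# should flag only misspelled plain words like 'ptite', not the bracketed hypermot.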

def correction_proposition(word):

   """propose a correction for a erroneous word"""
   if word_correct(word):
       return [word]
   cor = list()
   
   ## 1 letter wrong
   for i in range(len(word)):
       for l in letters:
           cword = ''
           cword += word[:i]
           cword += l
           if i < len(word)-1:
               cword += word[i+1:]
           if word_correct(cword):
               cor.append(cword)
   
   
   ## 1 letter missing            
   for i in range(len(word)+1):
       for l in letters:
           cword = ''
           cword += word[:i]
           cword += l
           if i < len(word):
               cword += word[i:]
           if word_correct(cword):
               cor.append(cword)
               
               
   ## 1 letter too many
   for i in range(len(word)):
       cword = ''
       cword += word[:i]
       if i < len(word)-1:
           cword += word[i+1:]
       if word_correct(cword):
           cor.append(cword)
   


   ##special patterns
   for i in range(len(word)):
       if word[i] == 'f':
           cword = ''
           cword += word[:i]
           cword += 'ph'
           if i < len(word)-1:
               cword += word[i+1:]
           if word_correct(cword):
               cor.append(cword)
       
   
   
   
   ## to make each correction unique
   return list(set(cor))
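
# Usage sketch (hypothetical words; the actual candidates depend on the downloaded
# dictionary): correction_proposition('honme') should include 'homme' among its
# candidates, since the generator tries every one-letter substitution, insertion
# and deletion, plus the 'f' -> 'ph' pattern.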

def corrections_for_words(word):

   """Compute and cache correction proposals for every unique word in the list."""
   unique_w = list(set(word))
   for w in unique_w:
       if w not in corrections:
           corrections[w] = correction_proposition(w)
           c = w
           c += ':'
           c += ','.join(corrections[w])
           c += '\n'
           write_corrections(c)
               


def print_correct_proposition_text(text):

   wrong = text_wrong_words(text, False)
   corrections_for_words(wrong)
   for w in wrong:
       print(w + '\ncorrection: ' + str(corrections[w]))
       print()


def get_text(page_name):

   """Fetch the raw wikitext of a page through the MediaWiki export API."""
   result = requests.post(baseurl+'api.php?action=query&titles='+page_name+'&export&exportnowrap')
   soup=BeautifulSoup(result.text, "lxml")
   text = ''
   for primitive in soup.findAll("text"):
       try:
           text += primitive.string
       except:
           # primitive.string is None for empty <text> nodes
           continue
   return text

def correct_in_text(text):

   """Annotate every misspelled word in text with its proposed corrections."""
   text = text + ' '  # sentinel space so text[i+l] is always defined
   wrong = list(set(text_wrong_words(text, False)))
   corrections_for_words(wrong)
   for w in wrong:
       i = text.find(w)
       l = len(w)
       while i in range(0, len(text)):
           if i > 0 and not text[i-1].isalpha() and not text[i+l].isalpha():
               text = text[:i]+ '' +text[i:i + l] + ' (correction(s): ' +', '.join(corrections[w])+')'+ text[i + l:]
           i = text.find(w, i+25)  # continue the search further along, past the start of the inserted annotation
           
   return text
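
# Annotation sketch (hypothetical): assuming 'naissance' is in the dictionary,
# a misspelling like "naissace" in the page text would become
#   "naissace (correction(s): naissance)"
# in the corrected wikitext returned by correct_in_text.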
               

def getPageList():

   protected_logins=["Frederickaplan","Maud","Vbuntinx","Testbot","IB","SourceBot","PageUpdaterBot","Orthobot","BioPathBot","ChronoBOT","Amonbaro","AntoineL","AntoniasBanderos","Arnau","Arnaudpannatier","Aureliver","Brunowicht","Burgerpop","Cedricviaccoz","Christophe","Claudioloureiro","Ghislain","Gregoire3245","Hirtg","Houssm","Icebaker","JenniCin","JiggyQ","JulienB","Kl","Kperrard","Leandro Kieliger","Marcus","Martin","MatteoGiorla","Mireille","Mj2905","Musluoglucem","Nacho","Nameless","Nawel","O'showa","PA","Qantik","QuentinB","Raphael.barman","Roblan11","Romain Fournier","Sbaaa","Snus","Sonia","Tboyer","Thierry","Titi","Vlaedr","Wanda"]
   pages=[]
   for user in protected_logins:
       result=requests.post(baseurl+'api.php?action=query&list=usercontribs&ucuser='+user+'&format=xml&ucend=2017-02-02T16:00:00Z')
       soup=BeautifulSoup(result.content,'lxml')
       for primitive in soup.usercontribs.findAll('item'):
           pages.append(primitive['title'])
   return list(set(pages))
          

def edit_page(page, text):

   payload = {'action':'edit','assert':'user','format':'json','utf8':'','text': text,'summary':summary,'title':page,'token':edit_token}
   requests.post(baseurl+'api.php',data=payload,cookies=edit_cookie)
   


def main(*names):

   t = time.time()
   pages = ""
   if len(names) == 0:
       pages = getPageList()
   else:
       pages = names
   for p in pages:
       if not (p.find('Fichier') == 0 or p in ['Monsieur Y', 'Madame X', 'Biographies']):
           print(p)
           corrected = correct_in_text(get_text(p))
           #edit_page(p, corrected)
   print(time.time() - t)
   # Questions to ask:
   # What should be done with the hypermots?
   # How should a correction be proposed to the user? Write the word in red
   # and put its corrections on the discussion page
   # Should we "infect" the site?
   # texte rouge gives "texte rouge" in red in the wiki syntax
   # password orthobot: orthobot2017
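
# Usage sketch: main() with no arguments spell-checks every page returned by
# getPageList(); main('Henri Dunant') (hypothetical page title) would check only
# that page. The edit_page call is commented out above, so nothing is written
# back to the wiki yet.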

</nowiki>