« Code TranslatorBot » : différence entre les versions

De Wikipast
Aller à la navigation Aller à la recherche
Aucun résumé des modifications
Aucun résumé des modifications
 
(Une version intermédiaire par le même utilisateur non affichée)
Ligne 11 : Ligne 11 :
     baseurl='http://wikipast.epfl.ch/wikipast/'
     baseurl='http://wikipast.epfl.ch/wikipast/'
     summary='Wikipastbot update'
     summary='Wikipastbot update'
    translator = Translator()


     # this parameter is the target language in which we want to translate
     # this parameter is the target language in which we want to translate
Ligne 41 : Ligne 40 :
     # we fetch the text we want to translate
     # we fetch the text we want to translate
     for name in pages:
     for name in pages:
        translator = Translator()
         result=requests.post(baseurl+'api.php?action=query&titles='+name+'&export&exportnowrap')
         result=requests.post(baseurl+'api.php?action=query&titles='+name+'&export&exportnowrap')
         soup=BeautifulSoup(result.text, "lxml")
         soup=BeautifulSoup(result.text, "lxml")
Ligne 46 : Ligne 46 :
         for primitive in soup.findAll("text"):
         for primitive in soup.findAll("text"):
             code += primitive.string
             code += primitive.string
        print(code)


         # create names with english prefix
         # create names with english prefix
Ligne 52 : Ligne 51 :


         # add a table in the french page if it still not exists
         # add a table in the french page if it still not exists
         if(code != '' and code[0] != '{'and code[0] != '|') :      
         if ("|Langue" not in code) and ("|Language" not in code) : 
            result=requests.post(baseurl+'api.php?action=query&titles='+en_name+'&export&exportnowrap')
            soup=BeautifulSoup(result.text, "lxml")
            res=''
            for primitive in soup.findAll("text"):
                res+=primitive.string
            if (res == ''):


            code2 = '''{| class="wikitable"\n|Langue \n|''' + "'''Français'''\n|[[" + en_name + "|" + target_language + "]]\n|}\n" + code
                code2 = '''{| class="wikitable"\n|Langue \n|''' + "'''Français'''\n|[[" + en_name + "|" + target_language + "]]\n|}\n" + code
            payload2={'action':'edit','assert':'user','format':'json','utf8':'','text':code2,'summary':summary,'title':name,'token':edit_token}
                payload2={'action':'edit','assert':'user','format':'json','utf8':'','text':code2,'summary':summary,'title':name,'token':edit_token}
            r5=requests.post(baseurl+'api.php',data=payload2,cookies=edit_cookie)
                r5=requests.post(baseurl+'api.php',data=payload2,cookies=edit_cookie)


            # save the links of sources that we won't translate
                # save the links of sources that we won't translate
            sources = []
                sources = []
            i=0
                i=0
            while i< len(code):
                while i< len(code):
                if (code[i-1] != '[' and code[i] == '[' and code[i+1] != '[') :
                    if (code[i-1] != '[' and code[i] == '[' and code[i+1] != '[') :
                    j = i+2
                        j = i+2
                    while(code[j] != ']') :
                        while(code[j] != ']') :
                        j += 1
                            j += 1
                    sources.append(code[i:j+1])         
                        sources.append(code[i:j+1])         
                    code = code.replace(code[i:j+1], "&&&", 1)
                        code = code.replace(code[i:j+1], "&&&", 1)
                    i = j+1
                        i = j+1
                else:
                    else:
                    i += 1
                        i += 1


            # translate the whole text by chunk of approx. 5000 characters.
                # translate the whole text by chunk of approx. 5000 characters.
            length = len(code)
                length = len(code)
            chaine =''
                chaine =''
            punto = '.'
                punto = '.'
            k = 0
                k = 0
            diminution = 1
                diminution = 1
            last = k+5000
                last = k+5000
            while last < length:         
                while last < length:         
                if code[last] == punto:
                    if code[last] == punto:
                    chaine += translator.translate(code[k:last], src = 'fr', dest= target_lang).text
                        chaine += translator.translate(code[k:last], src = 'fr', dest= target_lang).text
                    k = last
                        k = last
                else:
                    else:
                    while code[k+5000-diminution] != punto:
                        while code[k+5000-diminution] != punto:
                        diminution += 1
                            diminution += 1
                    chaine += translator.translate(code[k:k+5000-diminution],src = 'fr', dest= target_lang).text
                        chaine += translator.translate(code[k:k+5000-diminution],src = 'fr', dest= target_lang).text
                    k = k+5000-diminution+1
                        k = k+5000-diminution+1
                    diminution = 1
                        diminution = 1
                last += 5000         
                    last += 5000         
            last -= 5000
                last -= 5000
            chaine += translator.translate(code[last:length],src='fr',dest= target_lang).text
                chaine += translator.translate(code[last:length],src='fr',dest= target_lang).text


            translated_text = chaine
                translated_text = chaine


            # make the hyperlinks point to the correct page while hiding the (en)
                # make the hyperlinks point to the correct page while hiding the (en)
            for i in range(len(translated_text)):
                for i in range(len(translated_text)):
                if (translated_text[i] == '[' and translated_text[i+1] == '[' and translated_text[i+2].isalpha()) :
                    if (translated_text[i] == '[' and translated_text[i+1] == '[' and translated_text[i+2].isalpha()) :
                    j = i
                        j = i
                    while(translated_text[j] != ']') :
                        while(translated_text[j] != ']') :
                        j += 1
                            j += 1
                    m = translated_text[i+2:j]
                        m = translated_text[i+2:j]
                    linkM = "[[" + m + "]]"
                        linkM = "[[" + m + "]]"
                    translated_text = translated_text.replace(linkM, "[[(" + target_lang + ")_" + m + '|' + m + "]]")
                        translated_text = translated_text.replace(linkM, "[[(" + target_lang + ")_" + m + '|' + m + "]]")


            # replace the translates sources by the original ones
                # replace the translates sources by the original ones
            M_final_text = translated_text.split("&&&")
                M_final_text = translated_text.split("&&&")
            final_text =""
                final_text =""
            for i in range(len(sources)):
                for i in range(len(sources)):
                final_text += M_final_text[i]
                    final_text += M_final_text[i]
                final_text += sources[i]
                    final_text += sources[i]
            translated_text = final_text
                translated_text = final_text


            # avoid problems due to the comments of other bots
                if "(en)_File" in translated_text:
            translated_text = translated_text.replace('->', '-->').replace('<! -', '<!--').replace('</ ', '</')
                    translated_text = translated_text.replace("(en)_File", "Fichier")


            # add the table
                # avoid problems due to the comments of other bots
            translated_text = '''{| class="wikitable"\n|Language \n|[[''' + name + "|Français]]\n|'''" + target_language + "'''\n|}\n" + translated_text
                translated_text = translated_text.replace('->', '-->').replace('<! -', '<!--').replace('</ ', '</')


        # write on the page
                # add the table
            payload={'action':'edit','assert':'user','format':'json','utf8':'','text':translated_text,'summary':summary,'title':en_name,'token':edit_token}
                translated_text = '''{| class="wikitable"\n|Language \n|[[''' + name + "|Français]]\n|'''" + target_language + "'''\n|}\n" + translated_text
            r4=requests.post(baseurl+'api.php',data=payload,cookies=edit_cookie)
 
            # write on the page
                payload={'action':'edit','assert':'user','format':'json','utf8':'','text':translated_text,'summary':summary,'title':en_name,'token':edit_token}
                r4=requests.post(baseurl+'api.php',data=payload,cookies=edit_cookie)





Dernière version du 8 mai 2019 à 18:47

import requests
import re
from bs4 import BeautifulSoup
from googletrans import Translator

# main() takes the titles of the pages to translate as positional string arguments;
# when called without arguments it translates every page returned by scrape_datafications()
def main(*args):
    """Translate French Wikipast pages and publish the translations.

    Logs in to the wiki, then for every requested page:

    1. fetches its wikitext and translates its title to build the target
       title ``(<target_lang>)_<translated title>``;
    2. skips the page if its text already contains a language table
       (``|Langue`` / ``|Language``) or if the target page already has text;
    3. prepends a language table to the French page and saves it;
    4. masks external links (single-bracket ``[...]``) so they are not
       translated, translates the text chunk by chunk, retargets internal
       links to the prefixed pages, puts the external links back, repairs
       HTML comment markers and saves the result on the target page.

    Args:
        *args: titles of the pages to translate. When none are given, the
            titles returned by ``scrape_datafications()`` are used.
    """
    # NOTE(review): the bot credentials are hard-coded in source; consider
    # loading them from the environment or from an untracked config file.
    user='MasterBot'
    passw='dhbot2019'
    baseurl='http://wikipast.epfl.ch/wikipast/'
    summary='Wikipastbot update'

    # target language: code handed to the translator, and label shown in the tables
    target_lang = 'en'
    target_language = 'English'
    prefix = "(" + target_lang + ")_"

    # login request
    payload={'action':'query','format':'json','utf8':'','meta':'tokens','type':'login'}
    r1=requests.post(baseurl + 'api.php', data=payload)

    # login confirm
    login_token=r1.json()['query']['tokens']['logintoken']
    payload={'action':'login','format':'json','utf8':'','lgname':user,'lgpassword':passw,'lgtoken':login_token}
    r2=requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)

    # get edit token
    params3='?format=json&action=query&meta=tokens&continue='
    r3=requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
    edit_token=r3.json()['query']['tokens']['csrftoken']

    edit_cookie=r2.cookies.copy()
    edit_cookie.update(r3.cookies)

    pages = args if args else scrape_datafications()

    for name in pages:
        # one Translator instance per page
        translator = Translator()

        # we fetch the text we want to translate
        code = _fetch_wikitext(baseurl, name)

        # create the target title with the language prefix
        en_name = prefix + translator.translate(name, src='fr', dest=target_lang).text

        # a page that already carries a language table has been handled before
        if "|Langue" in code or "|Language" in code:
            continue
        # never overwrite a target page that already has content
        if _fetch_wikitext(baseurl, en_name) != '':
            continue

        # add the language table on top of the french page
        code2 = '''{| class="wikitable"\n|Langue \n|''' + "'''Français'''\n|[[" + en_name + "|" + target_language + "]]\n|}\n" + code
        payload2={'action':'edit','assert':'user','format':'json','utf8':'','text':code2,'summary':summary,'title':name,'token':edit_token}
        requests.post(baseurl+'api.php',data=payload2,cookies=edit_cookie)

        # save the links of sources that we won't translate
        masked, sources = _extract_sources(code)

        # translate the whole text by chunks of at most _CHUNK_LIMIT characters
        translated_text = ''.join(
            _translate(translator, chunk, target_lang)
            for chunk in _split_chunks(masked)
        )

        # make the hyperlinks point to the translated pages while hiding the prefix;
        # done while the sources are still masked so they cannot be altered
        translated_text = _prefix_internal_links(translated_text, prefix)

        # replace the placeholders by the original sources
        translated_text = _restore_sources(translated_text, sources)

        # file links must keep pointing to the original files
        translated_text = translated_text.replace(prefix + "File", "Fichier")

        # avoid problems due to the comments of other bots
        translated_text = _repair_comments(translated_text)

        # add the table
        translated_text = '''{| class="wikitable"\n|Language \n|[[''' + name + "|Français]]\n|'''" + target_language + "'''\n|}\n" + translated_text

        # write on the page
        payload={'action':'edit','assert':'user','format':'json','utf8':'','text':translated_text,'summary':summary,'title':en_name,'token':edit_token}
        requests.post(baseurl+'api.php',data=payload,cookies=edit_cookie)


# marker that stands in for an external link while the text is translated
_PLACEHOLDER = "&&&"
# maximum number of characters sent to the translator in one request
_CHUNK_LIMIT = 5000
# a single-bracket link: "[" neither preceded nor followed by another "["
_EXTERNAL_LINK = re.compile(r'(?<!\[)\[(?!\[)[^\]]*\]')
# a double-bracket link; the captured group is everything between the brackets
_INTERNAL_LINK = re.compile(r'\[\[([^\]]+)\]\]')
# "->" that is not already the tail of "-->"
_BROKEN_COMMENT_END = re.compile(r'(?<!-)->')


def _fetch_wikitext(baseurl, title):
    """Return the wikitext exported for *title*, or '' when there is none."""
    # the title goes through `params` so that requests percent-encodes it;
    # concatenating it into the URL breaks on titles containing '&' or '#'
    query = {'action': 'query', 'titles': title, 'export': '', 'exportnowrap': ''}
    result = requests.post(baseurl + 'api.php', params=query)
    soup = BeautifulSoup(result.text, "lxml")
    # Tag.string can be None (e.g. an empty element), hence the `or ''`
    return ''.join(node.string or '' for node in soup.find_all("text"))


def _extract_sources(text):
    """Replace every external link by the placeholder; return (masked text, links)."""
    sources = []

    def _stash(match):
        sources.append(match.group(0))
        return _PLACEHOLDER

    # a single left-to-right regex pass: no index arithmetic on a string that
    # shrinks while it is scanned, and an unterminated "[" is simply ignored
    return _EXTERNAL_LINK.sub(_stash, text), sources


def _split_chunks(text, limit=_CHUNK_LIMIT):
    """Split *text* into consecutive pieces of at most *limit* characters.

    Pieces end right after a full stop whenever the window contains one, and
    joining the pieces always gives back *text* unchanged.
    """
    chunks = []
    start = 0
    while len(text) - start > limit:
        stop = text.rfind('.', start, start + limit)
        # no full stop in the window: fall back to a hard cut so we always progress
        end = stop + 1 if stop != -1 else start + limit
        chunks.append(text[start:end])
        start = end
    if start < len(text):
        chunks.append(text[start:])
    return chunks


def _translate(translator, chunk, dest):
    """Translate one chunk from French; blank chunks are returned untouched."""
    if not chunk.strip():
        return chunk
    return translator.translate(chunk, src='fr', dest=dest).text


def _prefix_internal_links(text, prefix):
    """Point ``[[Page]]`` links at ``prefix + Page`` while displaying the old label."""

    def _retarget(match):
        inner = match.group(1)
        # only links starting with a letter are rewritten
        if not inner[0].isalpha():
            return match.group(0)
        # keep an existing "|label" as the label instead of appending a second one
        target, _, label = inner.partition('|')
        return "[[" + prefix + target + '|' + (label or target) + "]]"

    return _INTERNAL_LINK.sub(_retarget, text)


def _restore_sources(text, sources):
    """Put the saved external links back in place of the placeholders."""
    pieces = text.split(_PLACEHOLDER)
    rebuilt = [pieces[0]]
    for index, piece in enumerate(pieces[1:]):
        # more placeholders than saved links: leave the extra marker as it was
        rebuilt.append(sources[index] if index < len(sources) else _PLACEHOLDER)
        rebuilt.append(piece)
    # placeholders lost in translation: append the remaining links rather than drop them
    rebuilt.extend(sources[len(pieces) - 1:])
    return ''.join(rebuilt)


def _repair_comments(text):
    """Restore HTML comment and closing-tag markers altered by the translation."""
    # only a lone "->" is widened, so an intact "-->" is left as it is
    text = _BROKEN_COMMENT_END.sub('-->', text)
    return text.replace('<! -', '<!--').replace('</ ', '</')


def scrape_datafications():
    """Return the page titles listed on the wikipast "Biographies" page.

    The page is fetched through the export API and every occurrence of
    ``| [[`` in its wikitext is taken as the start of a title, which runs up
    to the next ``]]``.

    Returns:
        list of str: the titles, in the order they appear on the page.
    """
    baseurl='http://wikipast.epfl.ch/wikipast/'

    # The export query is sent without any session cookie, so no login is
    # performed here: an authenticated session would never be used.
    result=requests.post(baseurl+'api.php?action=query&titles=Biographies&export&exportnowrap')
    soup=BeautifulSoup(result.text, "lxml")
    # Tag.string can be None (e.g. an empty element), hence the `or ''`
    code=''.join(primitive.string or '' for primitive in soup.find_all("text"))

    # the first piece is whatever precedes the first "| [[", so it is skipped
    return [cell.split(']]')[0] for cell in code.split('| [[')[1:]]