« Code TranslatorBot » : différence entre les versions
Aller à la navigation
Aller à la recherche
Aucun résumé des modifications |
Aucun résumé des modifications |
||
Ligne 46 : | Ligne 46 : | ||
for primitive in soup.findAll("text"): | for primitive in soup.findAll("text"): | ||
code += primitive.string | code += primitive.string | ||
# create names with english prefix | # create names with english prefix | ||
Ligne 52 : | Ligne 51 : | ||
# add a table in the french page if it still not exists | # add a table in the french page if it still not exists | ||
if(code | if ("|Langue" not in code) and ("|Language" not in code) : | ||
code2 = '''{| class="wikitable"\n|Langue \n|''' + "'''Français'''\n|[[" + en_name + "|" + target_language + "]]\n|}\n" + code | code2 = '''{| class="wikitable"\n|Langue \n|''' + "'''Français'''\n|[[" + en_name + "|" + target_language + "]]\n|}\n" + code | ||
Ligne 112 : | Ligne 111 : | ||
final_text += sources[i] | final_text += sources[i] | ||
translated_text = final_text | translated_text = final_text | ||
if "(en)_File" in translated_text: | |||
translated_text = translated_text.replace("(en)_File", "Fichier") | |||
# avoid problems due to the comments of other bots | # avoid problems due to the comments of other bots |
Version du 8 mai 2019 à 17:52
import os
import re

import requests
from bs4 import BeautifulSoup
from googletrans import Translator

# NOTE(security): these credentials were hard-coded (twice) in the original
# source. They are kept as fallbacks so existing deployments keep working,
# but they should be rotated and supplied through the environment instead.
_USER = os.environ.get('WIKIPAST_BOT_USER', 'MasterBot')
_PASSWORD = os.environ.get('WIKIPAST_BOT_PASSWORD', 'dhbot2019')

_API_URL = 'http://wikipast.epfl.ch/wikipast/' + 'api.php'
_SUMMARY = 'Wikipastbot update'

# Token that stands in for an untranslated source link during translation.
_PLACEHOLDER = '&&&'
# Tolerates whitespace inserted between the ampersands.
# NOTE(review): assumes the translator may space the token out -- confirm.
_MANGLED_PLACEHOLDER = re.compile(r'&\s*&\s*&')

# Upper bound on the size of one translation request (the original code
# translated "by chunk of approx. 5000 characters").
_MAX_CHUNK = 5000

# A single-bracket link "[...]" that is not part of a "[[...]]" wiki link.
_EXTERNAL_LINK = re.compile(r'(?<!\[)\[[^\[][^\]]*\]')
# A "[[...]]" wiki link; group 1 is everything between the brackets.
_WIKI_LINK = re.compile(r'\[\[([^\[\]]+)\]\]')
# The language table this bot prepends to French pages (any link target).
_FRENCH_TABLE = re.compile(
    re.escape('''{| class="wikitable"\n|Langue \n|''' + "'''Français'''\n|[[")
    + r'[^\]]*'
    + re.escape("]]\n|}\n")
)


def _login():
    """Log the bot in and return ``(edit_token, edit_cookie)``."""
    # login request
    payload = {'action': 'query', 'format': 'json', 'utf8': '',
               'meta': 'tokens', 'type': 'login'}
    r1 = requests.post(_API_URL, data=payload)

    # login confirm
    login_token = r1.json()['query']['tokens']['logintoken']
    payload = {'action': 'login', 'format': 'json', 'utf8': '',
               'lgname': _USER, 'lgpassword': _PASSWORD,
               'lgtoken': login_token}
    r2 = requests.post(_API_URL, data=payload, cookies=r1.cookies)

    # get edit token
    params3 = '?format=json&action=query&meta=tokens&continue='
    r3 = requests.get(_API_URL + params3, cookies=r2.cookies)
    edit_token = r3.json()['query']['tokens']['csrftoken']

    edit_cookie = r2.cookies.copy()
    edit_cookie.update(r3.cookies)
    return edit_token, edit_cookie


def _fetch_wikitext(title):
    """Return the wikitext of page *title* ('' when no text is exported)."""
    # Passing the title through ``params`` lets requests URL-encode it;
    # concatenating it into the URL broke on titles containing '&', '#', ...
    params = {'action': 'query', 'titles': title,
              'export': '', 'exportnowrap': ''}
    result = requests.post(_API_URL, params=params)
    soup = BeautifulSoup(result.text, "lxml")
    # ``.string`` is None for an empty element, hence the ``or ''``.
    return ''.join(node.string or '' for node in soup.findAll("text"))


def _edit_page(title, text, edit_token, edit_cookie):
    """Overwrite page *title* with *text* and return the HTTP response."""
    payload = {'action': 'edit', 'assert': 'user', 'format': 'json',
               'utf8': '', 'text': text, 'summary': _SUMMARY,
               'title': title, 'token': edit_token}
    return requests.post(_API_URL, data=payload, cookies=edit_cookie)


def _protect_sources(text):
    """Replace every single-bracket link with the placeholder token.

    Returns ``(protected_text, sources)`` where *sources* lists the removed
    links in order of appearance, so they can be restored untranslated.
    """
    sources = []

    def _stash(match):
        sources.append(match.group(0))
        return _PLACEHOLDER

    return _EXTERNAL_LINK.sub(_stash, text), sources


def _restore_sources(text, sources):
    """Put the original source links back in place of the placeholders."""
    pieces = _MANGLED_PLACEHOLDER.sub(_PLACEHOLDER, text).split(_PLACEHOLDER)
    restored = [pieces[0]]
    for index, piece in enumerate(pieces[1:]):
        # More placeholders than sources means the page itself contained
        # the token; put it back untouched.
        restored.append(sources[index] if index < len(sources)
                        else _PLACEHOLDER)
        restored.append(piece)
    # Fewer placeholders than sources means some tokens did not survive
    # translation: append those links rather than dropping them silently.
    leftovers = sources[len(pieces) - 1:]
    if leftovers:
        restored.append('\n' + '\n'.join(leftovers))
    return ''.join(restored)


def _split_chunks(text, limit=_MAX_CHUNK):
    """Split *text* into pieces of at most *limit* characters.

    Cuts are made just after the last full stop inside each window when
    there is one, so sentences stay whole. Joining the pieces always gives
    back *text* exactly.
    """
    chunks = []
    start = 0
    while len(text) - start > limit:
        cut = text.rfind('.', start, start + limit)
        end = cut + 1 if cut != -1 else start + limit
        chunks.append(text[start:end])
        start = end
    if start < len(text):
        chunks.append(text[start:])
    return chunks


def _translate_text(translator, text, target_lang):
    """Translate *text* from French to *target_lang*, chunk by chunk."""
    translated = []
    for chunk in _split_chunks(text):
        core = chunk.strip()
        if not core:
            translated.append(chunk)
            continue
        # Re-attach the surrounding whitespace ourselves: leading newlines
        # are significant in wikitext (lists, headings, tables).
        lead = chunk[:len(chunk) - len(chunk.lstrip())]
        trail = chunk[len(chunk.rstrip()):]
        result = translator.translate(core, src='fr', dest=target_lang).text
        translated.append(lead + result + trail)
    return ''.join(translated)


def _localize_links(text, prefix):
    """Point wiki links at the prefixed (translated) pages.

    ``[[Page]]`` becomes ``[[<prefix>Page|Page]]`` and ``[[Page|label]]``
    becomes ``[[<prefix>Page|label]]``, so the prefix stays hidden. Links
    whose first character is not a letter are left untouched.
    """
    def _retarget(match):
        inner = match.group(1)
        if not inner[0].isalpha():
            return match.group(0)
        target, has_label, label = inner.partition('|')
        if not has_label:
            label = target
        return '[[' + prefix + target + '|' + label + ']]'

    return _WIKI_LINK.sub(_retarget, text)


def _repair_comments(text):
    """Undo the damage translation does to HTML comments and closing tags."""
    # Only a lone '->' is widened; an intact '-->' must stay as it is.
    text = re.sub(r'(?<!-)->', '-->', text)
    # NOTE(review): the '<! -' literal was split across two lines in the
    # archived copy of this source -- confirm the exact spacing.
    return text.replace('<! -', '<!--').replace('</ ', '</')


# the function takes a table of strings as argument containing the names of
# the pages to translate
def main(*args):
    """Translate French wiki pages to English and publish the result.

    Each page named in *args* (or, when none is given, each page returned
    by ``scrape_datafications()``) is fetched, translated and written to a
    page called ``(en)_<translated name>``. The French page gets a small
    language table linking to the translation if it has none yet.
    """
    translator = Translator()

    # this parameter is the target language in which we want to translate
    target_lang = 'en'
    target_language = 'English'
    prefix = "(" + target_lang + ")_"

    edit_token, edit_cookie = _login()
    pages = args if args else scrape_datafications()

    for name in pages:
        # we fetch the text we want to translate
        code = _fetch_wikitext(name)
        if not code:
            # Nothing exported (missing or empty page): nothing to translate.
            continue

        # create names with english prefix
        en_name = prefix + translator.translate(
            name, src='fr', dest=target_lang).text

        own_table = _FRENCH_TABLE.match(code)
        if own_table:
            # The bot's table is already there (a re-run): keep it out of
            # the translation so it is not duplicated on the English page.
            code = code[own_table.end():]
        elif ("|Langue" not in code) and ("|Language" not in code):
            # add a table to the french page if it does not have one yet
            code2 = ('''{| class="wikitable"\n|Langue \n|'''
                     + "'''Français'''\n|[[" + en_name + "|"
                     + target_language + "]]\n|}\n" + code)
            _edit_page(name, code2, edit_token, edit_cookie)

        # set aside the source links, which must not be translated
        protected, sources = _protect_sources(code)
        translated_text = _translate_text(translator, protected, target_lang)

        # make the hyperlinks point to the correct page while hiding the
        # language prefix
        translated_text = _localize_links(translated_text, prefix)

        # replace the placeholders by the original source links
        translated_text = _restore_sources(translated_text, sources)

        # file links must keep pointing at the (French) file namespace
        translated_text = translated_text.replace(prefix + "File", "Fichier")

        # avoid problems due to the comments of other bots
        translated_text = _repair_comments(translated_text)

        # add the table
        translated_text = ('''{| class="wikitable"\n|Language \n|[['''
                           + name + "|Français]]\n|'''" + target_language
                           + "'''\n|}\n" + translated_text)

        # write on the page
        _edit_page(en_name, translated_text, edit_token, edit_cookie)


def scrape_datafications():
    """Return the page names linked from the table of the "Biographies" page.

    A name is whatever stands between ``| [[`` and the next ``]]`` (or the
    next ``|``, for piped links) in the page's wikitext.
    """
    # The export request is sent without cookies, so the login sequence the
    # original performed here had no effect on it and has been removed.
    code = _fetch_wikitext('Biographies')
    return [row.split(']]')[0].split('|')[0]
            for row in code.split('| [[')[1:]]