« Code TranslatorBot » : différence entre les versions
Aller à la navigation
Aller à la recherche
Aucun résumé des modifications |
Aucun résumé des modifications |
||
| (Une version intermédiaire par le même utilisateur non affichée) | |||
| Ligne 11 : | Ligne 11 : | ||
baseurl='http://wikipast.epfl.ch/wikipast/' | baseurl='http://wikipast.epfl.ch/wikipast/' | ||
summary='Wikipastbot update' | summary='Wikipastbot update' | ||
# this parameter is the target language in which we want to translate | # this parameter is the target language in which we want to translate | ||
| Ligne 41 : | Ligne 40 : | ||
# we fetch the text we want to translate | # we fetch the text we want to translate | ||
for name in pages: | for name in pages: | ||
translator = Translator() | |||
result=requests.post(baseurl+'api.php?action=query&titles='+name+'&export&exportnowrap') | result=requests.post(baseurl+'api.php?action=query&titles='+name+'&export&exportnowrap') | ||
soup=BeautifulSoup(result.text, "lxml") | soup=BeautifulSoup(result.text, "lxml") | ||
| Ligne 46 : | Ligne 46 : | ||
for primitive in soup.findAll("text"): | for primitive in soup.findAll("text"): | ||
code += primitive.string | code += primitive.string | ||
# create names with english prefix | # create names with english prefix | ||
| Ligne 52 : | Ligne 51 : | ||
# add a table in the french page if it still not exists | # add a table in the french page if it still not exists | ||
if(code | if ("|Langue" not in code) and ("|Language" not in code) : | ||
result=requests.post(baseurl+'api.php?action=query&titles='+en_name+'&export&exportnowrap') | |||
soup=BeautifulSoup(result.text, "lxml") | |||
res='' | |||
for primitive in soup.findAll("text"): | |||
res+=primitive.string | |||
if (res == ''): | |||
code2 = '''{| class="wikitable"\n|Langue \n|''' + "'''Français'''\n|[[" + en_name + "|" + target_language + "]]\n|}\n" + code | |||
payload2={'action':'edit','assert':'user','format':'json','utf8':'','text':code2,'summary':summary,'title':name,'token':edit_token} | |||
r5=requests.post(baseurl+'api.php',data=payload2,cookies=edit_cookie) | |||
# save the links of sources that we won't translate | |||
sources = [] | |||
i=0 | |||
while i< len(code): | |||
if (code[i-1] != '[' and code[i] == '[' and code[i+1] != '[') : | |||
j = i+2 | |||
while(code[j] != ']') : | |||
j += 1 | |||
sources.append(code[i:j+1]) | |||
code = code.replace(code[i:j+1], "&&&", 1) | |||
i = j+1 | |||
else: | |||
i += 1 | |||
# translate the whole text by chunk of approx. 5000 characters. | |||
length = len(code) | |||
chaine ='' | |||
punto = '.' | |||
k = 0 | |||
diminution = 1 | |||
last = k+5000 | |||
while last < length: | |||
if code[last] == punto: | |||
chaine += translator.translate(code[k:last], src = 'fr', dest= target_lang).text | |||
k = last | |||
else: | |||
while code[k+5000-diminution] != punto: | |||
diminution += 1 | |||
chaine += translator.translate(code[k:k+5000-diminution],src = 'fr', dest= target_lang).text | |||
k = k+5000-diminution+1 | |||
diminution = 1 | |||
last += 5000 | |||
last -= 5000 | |||
chaine += translator.translate(code[last:length],src='fr',dest= target_lang).text | |||
translated_text = chaine | |||
# make the hyperlinks point to the correct page while hiding the (en) | |||
for i in range(len(translated_text)): | |||
if (translated_text[i] == '[' and translated_text[i+1] == '[' and translated_text[i+2].isalpha()) : | |||
j = i | |||
while(translated_text[j] != ']') : | |||
j += 1 | |||
m = translated_text[i+2:j] | |||
linkM = "[[" + m + "]]" | |||
translated_text = translated_text.replace(linkM, "[[(" + target_lang + ")_" + m + '|' + m + "]]") | |||
# replace the translates sources by the original ones | |||
M_final_text = translated_text.split("&&&") | |||
final_text ="" | |||
for i in range(len(sources)): | |||
final_text += M_final_text[i] | |||
final_text += sources[i] | |||
translated_text = final_text | |||
if "(en)_File" in translated_text: | |||
translated_text = translated_text.replace("(en)_File", "Fichier") | |||
# avoid problems due to the comments of other bots | |||
translated_text = translated_text.replace('->', '-->').replace('<! -', '<!--').replace('</ ', '</') | |||
# add the table | |||
translated_text = '''{| class="wikitable"\n|Language \n|[[''' + name + "|Français]]\n|'''" + target_language + "'''\n|}\n" + translated_text | |||
# write on the page | |||
payload={'action':'edit','assert':'user','format':'json','utf8':'','text':translated_text,'summary':summary,'title':en_name,'token':edit_token} | |||
r4=requests.post(baseurl+'api.php',data=payload,cookies=edit_cookie) | |||
Dernière version du 8 mai 2019 à 18:47
import requests
import re
from bs4 import BeautifulSoup
from googletrans import Translator
# main() takes the names of the pages to translate as positional string arguments; when called with no arguments it translates the pages returned by scrape_datafications()
def main(*args):
    """Translate Wikipast pages from French and publish the translations.

    For every page title the bot downloads the wikitext, translates it with
    googletrans, rewrites internal links so they point at the translated
    pages, and saves the result under ``"(<lang>)_<translated title>"``.
    When the French page has no language table yet and the translated page
    does not exist, a language table linking to the translation is also
    prepended to the French page.

    Args:
        *args: Titles of the pages to translate. When no title is given, the
            titles returned by ``scrape_datafications()`` are used.
    """
    # NOTE(review): credentials are hard-coded in the source; consider moving
    # them to configuration or the environment before sharing this file.
    user = 'MasterBot'
    passw = 'dhbot2019'
    baseurl = 'http://wikipast.epfl.ch/wikipast/'
    summary = 'Wikipastbot update'
    # this parameter is the target language in which we want to translate
    target_lang = 'en'
    target_language = 'English'

    # login request
    payload = {'action': 'query', 'format': 'json', 'utf8': '', 'meta': 'tokens', 'type': 'login'}
    r1 = requests.post(baseurl + 'api.php', data=payload)
    # login confirm
    login_token = r1.json()['query']['tokens']['logintoken']
    payload = {'action': 'login', 'format': 'json', 'utf8': '', 'lgname': user, 'lgpassword': passw, 'lgtoken': login_token}
    r2 = requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)
    # get the edit (CSRF) token
    params3 = '?format=json&action=query&meta=tokens&continue='
    r3 = requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
    edit_token = r3.json()['query']['tokens']['csrftoken']
    edit_cookie = r2.cookies.copy()
    edit_cookie.update(r3.cookies)

    pages = args if args else scrape_datafications()

    for name in pages:
        translator = Translator()
        # fetch the text we want to translate
        code = _fetch_wikitext(baseurl, name)
        if not code:
            # Nothing to translate: do not create an empty translated page.
            continue

        # create the name of the translated page, with the language prefix
        en_name = "(" + target_lang + ")_" + translator.translate(name, src='fr', dest=target_lang).text

        # add a language table to the French page if it has none yet and the
        # translated page does not exist
        if ("|Langue" not in code) and ("|Language" not in code):
            if _fetch_wikitext(baseurl, en_name) == '':
                code2 = '''{| class="wikitable"\n|Langue \n|''' + "'''Français'''\n|[[" + en_name + "|" + target_language + "]]\n|}\n" + code
                payload2 = {'action': 'edit', 'assert': 'user', 'format': 'json', 'utf8': '', 'text': code2, 'summary': summary, 'title': name, 'token': edit_token}
                requests.post(baseurl + 'api.php', data=payload2, cookies=edit_cookie)

        # set aside the source links: they must not be translated
        code, sources = _protect_sources(code)

        # translate the whole text chunk by chunk
        translated_text = ''.join(
            translator.translate(chunk, src='fr', dest=target_lang).text
            for chunk in _split_chunks(code)
        )

        # make the hyperlinks point to the translated pages while hiding the prefix
        translated_text = _localize_links(translated_text, target_lang)

        # put the original source links back in place of the placeholders
        translated_text = _restore_sources(translated_text, sources)

        # file links must keep the wiki's own namespace name
        translated_text = translated_text.replace("(" + target_lang + ")_File", "Fichier")

        # avoid problems due to the comments of other bots
        translated_text = translated_text.replace('->', '-->').replace('<! -', '<!--').replace('</ ', '</')

        # add the language table
        translated_text = '''{| class="wikitable"\n|Language \n|[[''' + name + "|Français]]\n|'''" + target_language + "'''\n|}\n" + translated_text

        # write the translated page
        payload = {'action': 'edit', 'assert': 'user', 'format': 'json', 'utf8': '', 'text': translated_text, 'summary': summary, 'title': en_name, 'token': edit_token}
        requests.post(baseurl + 'api.php', data=payload, cookies=edit_cookie)


def _fetch_wikitext(baseurl, title):
    """Return the wikitext of page *title* ('' when the export holds no text)."""
    # Passing the title through ``params`` lets requests URL-encode it, so
    # titles containing '&', '#' or '+' do not corrupt the query string.
    query = {'action': 'query', 'titles': title, 'export': '', 'exportnowrap': ''}
    result = requests.post(baseurl + 'api.php', params=query)
    soup = BeautifulSoup(result.text, "lxml")
    # ``.string`` is None for an empty <text> element; treat it as no text.
    return ''.join(node.string or '' for node in soup.findAll("text"))


def _protect_sources(code, placeholder="&&&"):
    """Replace every single-bracket link ``[...]`` in *code* by *placeholder*.

    Returns ``(text_with_placeholders, sources)`` where ``sources`` lists the
    removed links in order of appearance. Double-bracket links are left alone.
    """
    pieces = []
    sources = []
    size = len(code)
    i = 0
    while i < size:
        opens_source = (
            code[i] == '['
            and (i == 0 or code[i - 1] != '[')
            and i + 1 < size
            and code[i + 1] != '['
        )
        if opens_source:
            closing = code.find(']', i + 2)
            if closing != -1:
                sources.append(code[i:closing + 1])
                pieces.append(placeholder)
                i = closing + 1
                continue
        # Ordinary character, or an opening bracket that is never closed.
        pieces.append(code[i])
        i += 1
    return ''.join(pieces), sources


def _split_chunks(text, limit=5000):
    """Split *text* into consecutive chunks of at most *limit* characters.

    Each chunk ends right after the last period found in its window, so that
    sentences are not cut; a window without any period is cut at *limit*.
    Joining the chunks gives back *text* exactly.
    """
    # presumably 5000 is the per-request limit of the translation service
    chunks = []
    start = 0
    total = len(text)
    while total - start > limit:
        period = text.rfind('.', start, start + limit)
        end = period + 1 if period != -1 else start + limit
        chunks.append(text[start:end])
        start = end
    if start < total:
        chunks.append(text[start:])
    return chunks


def _localize_links(text, lang):
    """Point ``[[Title]]`` links at ``(<lang>)_Title`` while displaying ``Title``.

    Only links whose first character is a letter are rewritten. A piped link
    ``[[Title|label]]`` keeps its label and only has its target prefixed.
    """
    prefix = "(" + lang + ")_"

    def rewrite(match):
        inner = match.group(1)
        if not inner[0].isalpha():
            return match.group(0)
        target, pipe, label = inner.partition('|')
        if not pipe:
            label = target
        return "[[" + prefix + target + "|" + label + "]]"

    return re.sub(r'\[\[([^\[\]]+)\]\]', rewrite, text)


def _restore_sources(text, sources, placeholder="&&&"):
    """Put the links of *sources* back in place of the placeholders in *text*.

    The text following the last placeholder is kept. Sources whose
    placeholder did not survive the translation are appended at the end
    rather than dropped.
    """
    segments = text.split(placeholder)
    gaps = len(segments) - 1
    rebuilt = []
    for index, segment in enumerate(segments):
        rebuilt.append(segment)
        if index < gaps:
            rebuilt.append(sources[index] if index < len(sources) else placeholder)
    rebuilt.extend(sources[gaps:])
    return ''.join(rebuilt)
def scrape_datafications():
    """Return the page titles linked from table cells of the "Biographies" page.

    The wikitext of the "Biographies" page is downloaded through the export
    API and every table cell of the form ``| [[Title]]`` contributes one
    title. For a piped link ``| [[Title|label]]`` only ``Title`` is returned.
    No login is needed: the export query is sent anonymously.

    Returns:
        list of str: The linked page titles, in order of appearance.
    """
    baseurl = 'http://wikipast.epfl.ch/wikipast/'
    result = requests.post(baseurl + 'api.php?action=query&titles=Biographies&export&exportnowrap')
    soup = BeautifulSoup(result.text, "lxml")
    # ``.string`` is None for an empty <text> element; treat it as no text.
    code = ''.join(node.string or '' for node in soup.findAll("text"))
    names = []
    # The first fragment is whatever precedes the first linked cell.
    for cell in code.split('| [[')[1:]:
        link = cell.split(']]')[0]
        # A page title cannot contain '|': anything after it is a display label.
        names.append(link.split('|')[0])
    return names