Results and script below. I used CTranslate2's translate_batch to run each FLORES file through the model in one batched call instead of sentence by sentence, which speeds the evaluation up significantly.
{
"de-en": 60.8226,
"en-de": 53.13953,
"en-ar": 1.02682,
"en-es": 39.97683,
"ar-en": 50.60414,
"en-hi": 68.08493,
"en-ga": 65.7327,
"en-fr": 41.41509,
"en-hu": 49.62286,
"en-fi": 37.09572,
"en-ja": 44.10758,
"en-id": 36.87964,
"en-it": 44.2624,
"en-ko": 19.88853,
"en-nl": 27.15469,
"en-pt": 71.85321,
"en-pl": 23.35478,
"en-sv": 67.25956,
"en-uk": 38.07533,
"es-en": 36.0669,
"fi-en": 33.93363,
"fr-en": 20.05843,
"hi-en": 61.38313,
"id-en": 33.37866,
"ga-en": 59.74705,
"it-en": 49.87469,
"hu-en": 26.40585,
"pt-en": 89.59558,
"pl-en": 27.50059,
"ko-en": 37.24427,
"ja-en": 35.97562,
"sv-en": 56.56506,
"ru-en": 50.61911,
"nl-en": 27.47269,
"ca-en": 42.22077,
"cs-en": 52.70498,
"da-en": 35.89213,
"el-en": 38.56146,
"az-en": 9.62307,
"en-cs": 58.30419,
"en-az": 5.73751,
"en-ca": 63.21218,
"en-da": 47.056,
"en-eo": 26.75908,
"en-el": 44.01217,
"en-fa": 39.42022,
"en-he": 47.65576,
"en-ru": 50.98799,
"en-sk": 43.0692,
"en-zh": 50.45857,
"en-tr": 18.698,
"en-th": 35.72085,
"eo-en": 24.34548,
"he-en": 40.87901,
"sk-en": 32.57252,
"th-en": 22.66277,
"fa-en": 40.90661
}
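The script below writes these scores to bleu_scores.json at the end, so ranking the weakest pairs afterwards is straightforward. A quick sketch for inspecting that output (not part of the benchmark itself):

import json

# Load the scores the benchmark wrote and print the lowest-scoring pairs
with open("bleu_scores.json", encoding="utf-8") as f:
    scores = json.load(f)
for pair, bleu in sorted(scores.items(), key=lambda kv: kv[1])[:5]:
    print(f"{pair}: {bleu}")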
import json
import threading
import time

import ctranslate2
import sentencepiece
from argostranslate import package as packageManager
from sacrebleu import corpus_bleu

floresLoc = "E:\\TranslationData\\flores200_dataset\\dev\\"  # FLORES-200 dev files, downloaded separately
nllb_langs = {
"af":"afr_Latn",
"ak":"aka_Latn",
"am":"amh_Ethi",
"ar":"arb_Arab",
"as":"asm_Beng",
"ay":"ayr_Latn",
"az":"azj_Latn",
"bm":"bam_Latn",
"be":"bel_Cyrl",
"bn":"ben_Beng",
"bho":"bho_Deva",
"bs":"bos_Latn",
"bg":"bul_Cyrl",
"ca":"cat_Latn",
"ceb":"ceb_Latn",
"cs":"ces_Latn",
"ckb":"ckb_Arab",
"tt":"crh_Latn",
"cy":"cym_Latn",
"da":"dan_Latn",
"de":"deu_Latn",
"el":"ell_Grek",
"en":"eng_Latn",
"eo":"epo_Latn",
"et":"est_Latn",
"eu":"eus_Latn",
"ee":"ewe_Latn",
"fa":"pes_Arab",
"fi":"fin_Latn",
"fr":"fra_Latn",
"gd":"gla_Latn",
"ga":"gle_Latn",
"gl":"glg_Latn",
"gn":"grn_Latn",
"gu":"guj_Gujr",
"ht":"hat_Latn",
"ha":"hau_Latn",
"he":"heb_Hebr",
"hi":"hin_Deva",
"hr":"hrv_Latn",
"hu":"hun_Latn",
"hy":"hye_Armn",
"nl":"nld_Latn",
"ig":"ibo_Latn",
"ilo":"ilo_Latn",
"id":"ind_Latn",
"is":"isl_Latn",
"it":"ita_Latn",
"jv":"jav_Latn",
"ja":"jpn_Jpan",
"kn":"kan_Knda",
"ka":"kat_Geor",
"kk":"kaz_Cyrl",
"km":"khm_Khmr",
"rw":"kin_Latn",
"ko":"kor_Hang",
"ku":"kmr_Latn",
"lo":"lao_Laoo",
"lv":"lvs_Latn",
"ln":"lin_Latn",
"lt":"lit_Latn",
"lb":"ltz_Latn",
"lg":"lug_Latn",
"lus":"lus_Latn",
"mai":"mai_Deva",
"ml":"mal_Mlym",
"mr":"mar_Deva",
"mk":"mkd_Cyrl",
"mg":"plt_Latn",
"mt":"mlt_Latn",
"mni-Mtei":"mni_Beng",
"mni":"mni_Beng",
"mn":"khk_Cyrl",
"mi":"mri_Latn",
"ms":"zsm_Latn",
"my":"mya_Mymr",
"no":"nno_Latn",
"ne":"npi_Deva",
"ny":"nya_Latn",
"om":"gaz_Latn",
"or":"ory_Orya",
"pl":"pol_Latn",
"pt":"por_Latn",
"ps":"pbt_Arab",
"qu":"quy_Latn",
"ro":"ron_Latn",
"ru":"rus_Cyrl",
"sa":"san_Deva",
"si":"sin_Sinh",
"sk":"slk_Latn",
"sl":"slv_Latn",
"sm":"smo_Latn",
"sn":"sna_Latn",
"sd":"snd_Arab",
"so":"som_Latn",
"es":"spa_Latn",
"sq":"als_Latn",
"sr":"srp_Cyrl",
"su":"sun_Latn",
"sv":"swe_Latn",
"sw":"swh_Latn",
"ta":"tam_Taml",
"te":"tel_Telu",
"tg":"tgk_Cyrl",
"tl":"tgl_Latn",
"th":"tha_Thai",
"ti":"tir_Ethi",
"ts":"tso_Latn",
"tk":"tuk_Latn",
"tr":"tur_Latn",
"ug":"uig_Arab",
"uk":"ukr_Cyrl",
"ur":"urd_Arab",
"uz":"uzn_Latn",
"vi":"vie_Latn",
"xh":"xho_Latn",
"yi":"ydd_Hebr",
"yo":"yor_Latn",
"zh-CN":"zho_Hans",
"zh":"zho_Hans",
"zh-TW":"zho_Hant",
"zu":"zul_Latn",
"pa":"pan_Guru"
}
bleu_scores = {}  # "src-tgt" -> BLEU score, filled in by the worker threads
def returnTranslator(file_loc) -> dict:
    # Each Argos package bundles a CTranslate2 model dir and a SentencePiece model
    model = ctranslate2.Translator(f"{file_loc}/model", device="cuda", compute_type="auto")
    tokenizer = sentencepiece.SentencePieceProcessor(f"{file_loc}/sentencepiece.model")
    return {"model": model, "tokenizer": tokenizer}
def encode(text, tokenizer: sentencepiece.SentencePieceProcessor):
    return tokenizer.Encode(text, out_type=str)

def decode(tokens, tokenizer: sentencepiece.SentencePieceProcessor):
    return tokenizer.Decode(tokens)
def processFlores(pkg):
    data = returnTranslator(pkg.package_path)
    src_path = floresLoc + nllb_langs[pkg.from_code] + ".dev"
    tgt_path = floresLoc + nllb_langs[pkg.to_code] + ".dev"
    with open(src_path, encoding="utf-8") as f:
        src_text = [line.rstrip("\n") for line in f]
    with open(tgt_path, encoding="utf-8") as f:
        tgt_text = [line.rstrip("\n") for line in f]
    # Translate the whole file in a single batched call
    translation_obj = data["model"].translate_batch(
        encode(src_text, data["tokenizer"]),
        beam_size=2,
        return_scores=False,  # speed up
    )
    translated_text = [
        decode(tokens.hypotheses[0], data["tokenizer"])
        for tokens in translation_obj
    ]
    # sacrebleu expects a list of reference streams, each parallel to the
    # hypotheses, so a single reference set is passed as [tgt_text]
    pair = f"{pkg.from_code}-{pkg.to_code}"
    bleu_scores[pair] = round(
        corpus_bleu(translated_text, [tgt_text], tokenize="flores200").score, 5
    )
    print(f"{pair}: {bleu_scores[pair]}")
# Run at most 5 language pairs at a time
threads = []
for package in packageManager.get_installed_packages():
    threads.append(threading.Thread(target=processFlores, args=(package,)))
for thread in threads:
    # Throttle: wait until fewer than 5 workers are alive before starting the next
    while sum(1 for t in threads if t.is_alive()) >= 5:
        time.sleep(1)
    thread.start()
# Join everything so no scores are missing when the JSON is written
for thread in threads:
    thread.join()
# Write the scores to disk once every pair has finished
with open("bleu_scores.json", "w", encoding="utf-8") as outfile:
    json.dump(bleu_scores, outfile, indent=4)
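If you'd rather not hand-roll the throttling, concurrent.futures gives the same at-most-five-in-flight behaviour with less code. An equivalent sketch (not what produced the numbers above):

from concurrent.futures import ThreadPoolExecutor

# max_workers=5 replaces the manual is_alive() polling; the with-block joins
# every worker before falling through to write the JSON. list() forces the
# iterator so any exception raised inside processFlores surfaces here.
with ThreadPoolExecutor(max_workers=5) as pool:
    list(pool.map(processFlores, packageManager.get_installed_packages()))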