Fonctions PHP utiles #2 (détecter la langue d'un texte)

développement - septembre 2017

// Most frequently used words per language, keyed by language code.
// Entries must not carry leading/trailing whitespace: input text is tokenised
// on whitespace, so a padded entry could never match a token.
$lang_detect = array(
'zh' => array('我','你','的','是','了','他','么','们','在','有','这','那','不','什','个','来','要','就','一','人','。'),
'en' => array('the','of','and','to','a','in','is','you','are','for','that','or','it','as','be','on','your','with','can','have'),
'es' => array('de','que','no','a','la','el','y','es','en','lo','un','qué','por','me','una','te','se','los','con','para'),
'hi' => array('की','और','एक','तक','में','है','आप','कि','यह','वह','था','लिए','पर','केवल','सदा','साथ','उसके','वे','मैं','बाद'),
'fr' => array('je','de','est','pas','le','vous','la','tu','que','un','il','et','à','a','ne','les','ce','en','on','ça'),
'pt' => array('que','o','não','de','a','é','você','e','eu','um','se','para','está','uma','me','com','por','ele','em','isso'),
'ar' => array('لا','من','هذا','أن','في','أنا','على','ما','هل','يا','و','لقد','ذلك','ماذا','أنت','هنا','لم','إلى','نعم','كان'),
'id' => array('aku','tidak','yang','kau','ini','itu','di','dan','akan','apa','dia','anda','kita','untuk','saya','mereka','ada','bisa','tahu','dengan'),
'ru' => array('я','не','что','в','и','ты','это','на','с','он','вы','да','как','мы','мне','а','меня','у','нет','так'),
'nl' => array('ik','je','het','de','is','dat','een','niet','en','wat','van','we','in','ze','op','te','hij','zijn','er','maar'),
'ja' => array('の','に','は','て','を','が','だ','た','する','と','ます','で','ない','いる','も','ある','・','です','「','」'),
'it' => array('non','e','che','di','la','è','il','un','a','per','in','una','sono','mi','ho','si','lo','ha','ma','ti'),
);

/**
 * Normalises a text for word matching.
 *
 * The text is lower-cased (UTF-8 aware) and every separator listed below
 * (line breaks, tabs, punctuation, brackets, currency signs, the CJK full
 * stop) is replaced by a single ASCII space. Consecutive separators are not
 * collapsed, so the result may contain runs of spaces.
 *
 * @param string $t UTF-8 text to normalise.
 * @return string Lower-cased text with separators turned into spaces.
 */
function clean_text($t) {
    // "\r" and "\t" are included so that CRLF line endings and tab-separated
    // text do not leave control characters glued to the neighbouring words.
    $pattern = array(
        "\r", "\n", "\t",
        '+', ',', '.', '’', "'", '"', '&', '!', '?', ':', ';', '#', '~', '=', '/',
        '$', '£', '€', '^',
        '(', ')', '[', ']', '{', '}', '_', '<', '>',
        '。',
    );
    return str_replace($pattern, ' ', mb_convert_case($t, MB_CASE_LOWER, "UTF-8"));
}

/**
 * Guesses the language of a text by counting how many of its words appear in
 * each language's list of frequent words.
 *
 * The text is normalised with clean_text() and split on whitespace. Tokens
 * containing Han, Hiragana or Katakana characters (scripts written without
 * spaces between words) are further split into individual UTF-8 characters,
 * so only the single-character entries of the word lists can match them.
 *
 * @param string $t Text to analyse (UTF-8).
 * @param array  $a Map of language code => list of frequent words.
 * @return mixed The key of $a with the most hits; 'zh' when nothing matched
 *               but the text contains Han characters; 'en' as the default.
 */
function language_detect($t, $a) { // text, arrays
    $t = clean_text($t);

    // Split on any run of whitespace and drop empty tokens.
    $tokens = preg_split('/\s+/u', $t, -1, PREG_SPLIT_NO_EMPTY);
    if ($tokens === false) {
        // preg_split() fails on invalid UTF-8; fall back to a plain split.
        $tokens = explode(' ', $t);
    }

    $words = array();
    foreach ($tokens as $token) {
        if (preg_match('/[\p{Han}\p{Hiragana}\p{Katakana}]/u', $token)) {
            // Character-wise split must be UTF-8 aware: a byte-wise split
            // (str_split) would cut every character into unmatched bytes.
            $chars = preg_split('//u', $token, -1, PREG_SPLIT_NO_EMPTY);
            if ($chars !== false) {
                foreach ($chars as $char) {
                    $words[] = $char;
                }
            }
        } else {
            // Keep other tokens whole, so that a single word is matched as a
            // word rather than letter by letter.
            $words[] = $token;
        }
    }

    $res = array();
    foreach ($words as $w) { // for each word
        foreach ($a as $k => $l) { // for each language
            if (in_array($w, $l, true)) { // the word is in that language's list
                // Initialise on first hit to avoid an "undefined index" notice.
                $res[$k] = isset($res[$k]) ? $res[$k] + 1 : 1;
            }
        }
    }

    if (count($res) > 0) {
        arsort($res); // highest score first
        $ranked = array_keys($res);
        return $ranked[0]; // language found
    }
    if (preg_match("/\p{Han}+/u", $t)) { // Chinese: Han characters, no word hit
        return 'zh';
    }
    return 'en'; // default language
}


// EXAMPLE
// Sample French text to run through the detector.
$texte = 'Si tu vas à Rio, N\'oublie pas de monter là-haut ; Dans un petit village, Caché sous les fleurs sauvages';

// Outputs the detected language code for the sample text: fr
print language_detect($texte, $lang_detect);