Fonctions PHP utiles #2 (détecter la langue d'un texte)

Rédigé par beubeulone - - Aucun commentaire
// Most frequently used words (or characters) for each language, keyed by
// ISO 639-1 language code. Entries are compared for exact equality against
// lowercased, space-separated tokens, so they must be lowercase and must not
// carry leading or trailing whitespace.
$lang_detect = array(
	'zh' => array('我','你','的','是','了','他','么','们','在','有','这','那','不','什','个','来','要','就','一','人','。'),
	'en' => array('the','of','and','to','a','in','is','you','are','for','that','or','it','as','be','on','your','with','can','have'),
	'es' => array('de','que','no','a','la','el','y','es','en','lo','un','qué','por','me','una','te','se','los','con','para'),
	'hi' => array('की','और','एक','तक','में','है','आप','कि','यह','वह','था','लिए','पर','केवल','सदा','साथ','उसके','वे','मैं','बाद'),
	'fr' => array('je','de','est','pas','le','vous','la','tu','que','un','il','et','à','a','ne','les','ce','en','on','ça'),
	'pt' => array('que','o','não','de','a','é','você','e','eu','um','se','para','está','uma','me','com','por','ele','em','isso'),
	// 'أنا' previously had a trailing space, which made it impossible to match.
	'ar' => array('لا','من','هذا','أن','في','أنا','على','ما','هل','يا','و','لقد','ذلك','ماذا','أنت','هنا','لم','إلى','نعم','كان'),
	'id' => array('aku','tidak','yang','kau','ini','itu','di','dan','akan','apa','dia','anda','kita','untuk','saya','mereka','ada','bisa','tahu','dengan'),
	'ru' => array('я','не','что','в','и','ты','это','на','с','он','вы','да','как','мы','мне','а','меня','у','нет','так'),
	'nl' => array('ik','je','het','de','is','dat','een','niet','en','wat','van','we','in','ze','op','te','hij','zijn','er','maar'),
	'ja' => array('の','に','は','て','を','が','だ','た','する','と','ます','で','ない','いる','も','ある','・','です','「','」'),
	'it' => array('non','e','che','di','la','è','il','un','a','per','in','una','sono','mi','ho','si','lo','ha','ma','ti'),
	);

/**
 * Normalise a text so it can be split into words on single spaces.
 *
 * The text is lowercased (UTF-8 aware), then every separator character
 * (line breaks, tabs, punctuation, brackets, currency signs, the CJK full
 * stop) is replaced by one space. Consecutive separators therefore yield
 * consecutive spaces; nothing is collapsed or trimmed.
 *
 * @param string $t Raw UTF-8 text.
 * @return string Lowercased text with separators turned into spaces.
 */
function clean_text($t) {
	// "\r" and "\t" are included alongside "\n": otherwise CRLF line endings
	// or tab-separated text leave a control character stuck to a word,
	// which then can never equal an entry of the word lists.
	$pattern = array("\r","\n","\t","+",",",".","’","'",'"',"&","!","?",":",";","#","~","=","/",'$',"£","€","^","(",")","[","]","{","}","_","<",">","。");
	return str_replace($pattern, ' ', mb_convert_case($t, MB_CASE_LOWER, "UTF-8"));
	}

/**
 * Guess the language of a text by counting hits against per-language
 * lists of very common words.
 *
 * The text is normalised with clean_text() and split on spaces. When the
 * normalised text contains no space at all (typical for scripts written
 * without word separators), it is split into individual UTF-8 characters
 * instead, so that single-character entries of the lists can match.
 *
 * @param string $t Text to analyse (UTF-8).
 * @param array  $a Map of language code => list of lowercase tokens.
 * @return string Code of the language with the most hits; 'zh' when nothing
 *                matched but the text contains Han characters; 'en' otherwise.
 */
function language_detect($t, $a) {	// text, arrays
	$t = clean_text($t);
	$res = array();
	if (strpos($t, ' ') !== false)
		$words = explode(' ', $t);
	else {
		// Split per character, not per byte: str_split() would cut multibyte
		// UTF-8 characters into fragments that can never match a list entry.
		$words = preg_split('//u', $t, -1, PREG_SPLIT_NO_EMPTY);
		if ($words === false)	// invalid UTF-8: nothing usable to count
			$words = array();
		}
	foreach ($words as $w) {	// for each word
		if ($w === '')	// empty token produced by consecutive separators
			continue;
		foreach ($a as $k => $l) {	// for each language
			if (in_array($w, $l, true))	// the word belongs to that language's list
				// initialise explicitly; incrementing a missing key raises a notice/warning
				$res[$k] = (isset($res[$k]) ? $res[$k] : 0) + 1;
			}
		}
	if (count($res) > 0) {
		arsort($res);	// highest score first
		reset($res);
		return key($res);	// language found
		}
	if (preg_match('/\p{Han}+/u', $t))	// no hit but Han characters present: Chinese
		return 'zh';
	return 'en';	// default language
	}


// EXAMPLE: detect the language of a short French text.
$texte = "Si tu vas à Rio, N'oublie pas de monter là-haut ; Dans un petit village, Caché sous les fleurs sauvages";

print language_detect($texte, $lang_detect);	// outputs "fr"
Fil RSS des articles