fly512
(我本如是)
版主
  
人生的另一半
UID 42566
精华
0
积分 2489
帖子 2373
金钱 1759 喜悦币
威望 0
人脉 730
阅读权限 100
注册 2004-10-31 来自 亚特兰迪斯
状态 离线
|
[推荐阅读] 请教一下循环问题
<?php
/**
 * Build a "|"-separated keyword string from an article title and body.
 *
 * Both texts are segmented with SplitWord::SplitRMM() and passed through
 * SplitWord::GetIndexText(); the resulting space-separated tokens are then
 * concatenated, title tokens first, followed by body tokens that do not
 * already occur in the title.
 *
 * Tokens stop being appended once the accumulated string (including the
 * "|" separators) is at least 20 bytes long, so the result may slightly
 * exceed 20 bytes. The length is measured in bytes, not characters.
 *
 * NOTE(review): Html2Text() and SplitWord::GetIndexText() are defined outside
 * this block; the meaning of the second GetIndexText() argument (200) is
 * presumably a length limit on the index text -- confirm against its definition.
 *
 * @param string $body  Article body; passed through Html2Text() before segmentation.
 * @param string $title Optional article title; its tokens take priority.
 * @return string The addslashes()-escaped keywords joined by "|", with no
 *                leading or trailing "|"; an empty string when no token was found.
 */
function keyword($body, $title = '')
{
    $sp = new SplitWord();
    $titleindexs = explode(" ", trim($sp->GetIndexText($sp->SplitRMM($title))));
    $allindexs = explode(" ", trim($sp->GetIndexText($sp->SplitRMM(Html2Text($body)), 200)));

    // The dictionary is no longer needed once both texts are segmented.
    $sp->Clear();
    unset($sp);

    // Must be initialised: the original read it with strlen() and appended
    // to it with ".=" before any assignment.
    $keywords = '';

    foreach ($titleindexs as $k) {
        if (strlen($keywords) >= 20) {
            break;
        }
        // explode() on an empty string yields array(""); skipping empty
        // tokens avoids emitting bare "|" separators.
        if ($k === '') {
            continue;
        }
        $keywords .= $k . "|";
    }

    foreach ($allindexs as $k) {
        if (strlen($keywords) >= 20) {
            break;
        }
        // Strict comparison so that numeric-looking tokens such as "1" and
        // "1.0" are not treated as the same word.
        if ($k === '' || in_array($k, $titleindexs, true)) {
            continue;
        }
        $keywords .= $k . "|";
    }

    // Strip separators from both ends. The original only removed a trailing
    // "|" and, in its leading-"|" branch, returned the "|" itself.
    return trim(addslashes($keywords), '|');
}
class SplitWord { var $RankDic = Array(); var $OneNameDic = Array(); var $TwoNameDic = Array(); var $NewWord = Array(); var $SourceString = ''; var $ResultString = ''; var $SplitChar = ' '; //分隔符 var $SplitLen = 4; //保留词长度 var $EspecialChar = "和|的|是"; var $NewWordLimit = "在|的|与|或|就|你|我|他|她|有|了|是|其|能|对|地"; //这里可以按需要加入常用的量词, //程序会检测词语第一个字是否为这些词和上一个词是否为数词,然后结合为单词 var $CommonUnit = "年|月|日|时|分|秒|点|元|百|千|万|亿|位|辆"; var $CnNumber = "0|1|2|3|4|5|6|7|8|9|+|-|%|.|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s |t|u|v|w|x|y|z|A|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z"; var $CnSgNum = "一|二|三|四|五|六|七|八|九|十|百|千|万|亿|数"; var $MaxLen = 11; //词典最大 7 中文字,这里的数值为字节数组的最大索引 var $MinLen = 3; //最小 2 中文字,这里的数值为字节数组的最大索引 var $CnTwoName = "端木 南宫 谯笪 轩辕 令狐 钟离 闾丘 长孙 鲜于 宇文 司徒 司空 上官 欧阳 公孙 西门 东门 左丘 东郭 呼延 慕容 司马 夏侯 诸葛 东方 赫连 皇甫 尉迟 申屠"; var $CnOneName = "赵钱孙李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜戚谢邹喻柏水窦章云苏潘葛奚范彭郎鲁韦昌马苗凤花方俞任袁柳酆鲍史唐费廉岑薛雷贺倪汤滕殷罗毕郝邬安常乐于时傅皮卡齐康伍余元卜顾孟平黄穆萧尹姚邵堪汪祁毛禹狄米贝明臧计伏成戴谈宋茅庞熊纪舒屈项祝董粱杜阮蓝闵席季麻强贾路娄危江童颜郭梅盛林刁钟徐邱骆高夏蔡田樊胡凌霍虞万支柯咎管卢莫经房裘缪干解应宗宣丁贲邓郁单杭洪包诸左石崔吉钮龚程嵇邢滑裴陆荣翁荀羊於惠甄魏加封芮羿储靳汲邴糜松井段富巫乌焦巴弓牧隗谷车侯宓蓬全郗班仰秋仲伊宫宁仇栾暴甘钭厉戎祖武符刘姜詹束龙叶幸司韶郜黎蓟薄印宿白怀蒲台从鄂索咸籍赖卓蔺屠蒙池乔阴郁胥能苍双闻莘党翟谭贡劳逄姬申扶堵冉宰郦雍郤璩桑桂濮牛寿通边扈燕冀郏浦尚农温别庄晏柴翟阎充慕连茹习宦艾鱼容向古易慎戈廖庚终暨居衡步都耿满弘匡国文寇广禄阙东殴殳沃利蔚越夔隆师巩厍聂晁勾敖融冷訾辛阚那简饶空曾沙须丰巢关蒯相查后江游竺"; //------------------------------ //php4构造函数 //------------------------------ function SplitWord(){ $this->__construct(); } //------------------------------ //php5构造函数 //------------------------------ function __construct(){ //载入姓氏词典 for($i=0;$i<strlen($this->CnOneName);$i++){ $this->OneNameDic[$this->CnOneName[$i].$this->CnOneName[$i+1]] = 1; $i++; } $twoname = explode(" ",$this->CnTwoName); foreach($twoname as $n){ $this->TwoNameDic[$n] = 1; } unset($twoname); unset($this->CnTwoName); unset($this->CnOneName); //高级分词,预先载入词典以提分词高速度 $dicfile = dirname(__FILE__)."/table/word.csv"; $fp = fopen($dicfile,'r'); while($line = fgets($fp,64)){ $ws = explode(' ',$line); $this->RankDic[strlen($ws[0])][$ws[0]] = $ws[1]; } fclose($fp); } //-------------------------- //析放资源 
//-------------------------- function Clear() { unset($this->RankDic); } //---------------------------- //设置源字符串 //---------------------------- function SetSource($str){ $this->SourceString = trim($this->ReviseString($str)); $this->ResultString = ""; } //----------------------------- //检查字符串是否不存在中文 //----------------------------- function NotGBK($str) { if($str=="") return ""; //因为粗分的时候已经处理,因此不必要检查所的字符 if( ord($str[0])>0x80 ) return false; else return true; } //----------------------------- //RMM分词算法 //----------------------------- function SplitRMM($str="",$tryNumName=true,$tryDiff=true){ if($str!="") $this->SetSource(trim($str)); if($this->SourceString=="") return ""; //对文本进行粗分 $this->SourceString = $this->ReviseString($this->SourceString); //对特定文本进行分离 $spwords = explode(" ",$this->SourceString); $spLen = count($spwords); $spc = $this->SplitChar; for($i=($spLen-1);$i>=0;$i--){ if(trim($spwords[$i])=="") continue; if($this->NotGBK($spwords[$i])){ $this->ResultString = $spwords[$i].$spc.$this->ResultString; } else { $c = $spwords[$i][0].$spwords[$i][1]; $n = hexdec(bin2hex($c)); if($c=="《") //书名 { $this->ResultString = $spwords[$i].$spc.$this->ResultString; } else if($n>0xA13F && $n < 0xAA40) //标点符号 { $this->ResultString = $spwords[$i].$spc.$this->ResultString; } else //正常短句 { if(strlen($spwords[$i]) <= $this->SplitLen) { //如果结束符为特殊分割词,分离处理 if(preg_match("/".$this->EspecialChar."$/",$spwords[$i],$regs)){ $spwords[$i] = preg_replace("/".$regs[0]."$/","",$spwords[$i]).$spc.$regs[0]; } //是否为常用单位 if(!preg_match("/^".$this->CommonUnit."/",$spwords[$i]) || $i==0){ $this->ResultString = $spwords[$i].$spc.$this->ResultString; }else{ $this->ResultString = $spwords[$i-1].$spwords[$i].$spc.$this->ResultString; $i--; } } else{ $this->ResultString = $this->RunRMM($spwords[$i],$tryNumName,$tryDiff).$spc.$this->ResultString; } } } } $this->ResultString = preg_replace("/ {1,}/"," ",$this->ResultString); //return $this->ParNumber($this->ResultString); return $this->ResultString; } 
//------------------------ //对常规数量词进行识别 //------------------------ function ParNumber($str){ if($str == "") return ""; $ws = explode(' ',$str); $wlen = count($ws); $spc = $this->SplitChar; $reStr = ""; for($i=0;$i<$wlen;$i++){ if($ws[$i]=="") continue; if($i>=$wlen-1) $reStr .= $spc.$ws[$i]; else{ $reStr .= $spc.$ws[$i]; } } return $reStr; } //------------------------------- //进行名字识别和其它数词识别 //-------------------------------- function ParOther($WordArray) { $wlen = count($WordArray)-1; $rsStr = ""; $spc = $this->SplitChar; for($i=$wlen;$i>=0;$i--) { //数量词 if(preg_match("/".$this->CnSgNum."/",$WordArray[$i])){ $rsStr .= $spc.$WordArray[$i]; if($i>0 && preg_match("/^".$this->CommonUnit."/",$WordArray[$i-1])) { $rsStr .= $WordArray[$i-1]; $i--; } else{ while($i>0 && preg_match("/".$this->CnSgNum."/",$WordArray[$i-1])) { $rsStr .= $WordArray[$i-1]; $i--; } } continue; } //双字姓 if(strlen($WordArray[$i])==4 && isset($this->TwoNameDic[$WordArray[$i]])) { $rsStr .= $spc.$WordArray[$i]; if($i>0&&strlen($WordArray[$i-1])==2){ $rsStr .= $WordArray[$i-1];$i--; if($i>0&&strlen($WordArray[$i-1])==2){ $rsStr .= $WordArray[$i-1];$i--; } } } //单字姓 else if(strlen($WordArray[$i])==2 && isset($this->OneNameDic[$WordArray[$i]])) { $rsStr .= $spc.$WordArray[$i]; if($i>0&&strlen($WordArray[$i-1])==2){ if(preg_match("/".$this->EspecialChar."/",$WordArray[$i-1])) continue; $rsStr .= $WordArray[$i-1];$i--; if($i>0 && strlen($WordArray[$i-1])==2 && !preg_match("/".$this->EspecialChar."/",$WordArray[$i-1])) { $rsStr .= $WordArray[$i-1];$i--; } } } //普通词汇 else{ $rsStr .= $spc.$WordArray[$i]; } } //返回本段分词结果 $rsStr = preg_replace("/^".$spc."/","",$rsStr); return $rsStr; } //对全中文字符串进行逆向匹配方式分解 function RunRMM($str,$tryNumName=true,$tryDiff=true) { $spc = $this->SplitChar; $spLen = strlen($str); $rsStr = ""; $okWord = ""; $tmpWord = ""; $WordArray = Array(); //逆向字典匹配 for($i=($spLen-1);$i>=0;) { //当i达到最小可能词的时候 if($i<=$this->MinLen){ if($i==1){ $WordArray[] = substr($str,0,2); }else{ $w = 
substr($str,0,$this->MinLen+1); if($this->IsWord($w)){ $WordArray[] = $w; }else{ $WordArray[] = substr($str,2,2); $WordArray[] = substr($str,0,2); } } $i = -1; break; } //分析在最小词以上时的情况 if($i>=$this->MaxLen) $maxPos = $this->MaxLen; else $maxPos = $i; $isMatch = false; for($j=$maxPos;$j>=0;$j=$j-2){ $w = substr($str,$i-$j,$j+1); if($this->IsWord($w)){ $WordArray[] = $w; $i = $i-$j-1; $isMatch = true; break; } } if(!$isMatch){ if($i>1) { $WordArray[] = $str[$i-1].$str[$i]; $i = $i-2; } } }//End For //名字和数量词识别 if($tryNumName) { $rsStr = $this->ParOther($WordArray); } else{ $wlen = count($WordArray)-1; for($i=$wlen;$i>=0;$i--){ $rsStr .= $spc.$WordArray[$i]; } } //消岐处理 if($tryDiff) $rsStr = $this->TestDiff(trim($rsStr)); return $rsStr; } //---------------------------------- //对分词结果进行消岐处理 //---------------------------------- function TestDiff($str){ $str = preg_replace("/ {1,}/"," ",$str); if($str == ""||$str == " ") return ""; $ws = explode(' ',$str); $wlen = count($ws); $spc = $this->SplitChar; $reStr = ""; for($i=0;$i<$wlen;$i++){ //循环到最后一个词不处理 if($i>=($wlen-1)) { $reStr .= $spc.$ws[$i]; } //其它词的处理 else{ //叠词规则 if($ws[$i]==$ws[$i+1]){ $reStr .= $spc.$ws[$i].$ws[$i+1]; $i++; continue; } //单字词和二三字词之间的岐义处理 if(strlen($ws[$i])==2 && strlen($ws[$i+1])<8 && strlen($ws[$i+1])>2){ $addw = $ws[$i].$ws[$i+1]; $t = 6; $testok = false; while($t>=4){ $w = substr($addw,0,$t); if($this->IsWord($w) && ($this->GetRank($w) > $this->GetRank($ws[$i+1])*2) ){ $limitW = substr($ws[$i+1],strlen($ws[$i+1])-$t-2,strlen($ws[$i+1])-strlen($w)+2); if($limitW!="") $reStr .= $spc.$w.$spc.$limitW; else $reStr .= $spc.$w; $testok = true; break; } $t = $t-2; } if(!$testok) $reStr .= $spc.$ws[$i]; else $i++; } //前后均为二字到三字的词进行交叉岐义处理 else if(strlen($ws[$i])>2 && strlen($ws[$i])<8 && strlen($ws[$i+1])>2 && strlen($ws[$i+1])<8) { $t21 = substr($ws[$i+1],0,2); $t22 = substr($ws[$i+1],0,4); //如果上一个词接下一个词的首字为词 if($this->IsWord($ws[$i].$t21)){ if(strlen($ws[$i])==6||strlen($ws[$i+1])==6){ $reStr .= 
$spc.$ws[$i].$t21.$spc.substr($ws[$i+1],2,strlen($ws[$i+1])-2); $i++; }else{ $reStr .= $spc.$ws[$i]; } } //对于下一个词为3字词或2字词进行不同的处理 else if(strlen($ws[$i+1])==6){ if($this->IsWord($ws[$i].$t22)){ $reStr .= $spc.$ws[$i].$t22.$spc.$ws[$i+1][4].$ws[$i+1][5]; $i++; }else{ $reStr .= $spc.$ws[$i]; } } // //两字词交叉识别,视情况选择 // else if(strlen($ws[$i+1])==4){ $addw = $ws[$i].$ws[$i+1]; $t = strlen($ws[$i+1])-2; $testok = false; while($t>0){ $w = substr($addw,0,strlen($ws[$i])+$t); if($this->IsWord($w) && ($this->GetRank($w) > $this->GetRank($ws[$i+1])*2) ) { $limitW = substr($ws[$i+1],$t,strlen($ws[$i+1])-$t); if( | |