PHP制作百度词典查词采集器

6年以前  |  阅读数:1047 次  |  编程语言:PHP 

百度dict 采集样本

这是我写的一个采集器,用来采集百度词典(dict)翻译后的全部结果数据,同时附带了13.5万单词的词库和一个简单的采集案例。这里我把主要的类dict.class.php放出来,项目地址:http://github.com/widuu/baidu_dict ,有需要的直接fork就可以了~么么哒,这东西用的人很少,所以有用的兄弟拿走了哈~


    <?php
    /**
     * dict.class.php 采集百度词典翻译内容
     *
     * @copyright      (C) 2014 widuu
     * @license       http://www.widuu.com
     * @lastmodify     2014-2-15
     */


    // Top-level side effect: sends a Content-Type response header declaring
    // HTML output with charset "utf8" as soon as this file is loaded.
    header("content-type:text/html;charset=utf8");
    /**
     * Scrapes the Baidu dictionary result page for one English word and
     * extracts phonetic symbols, pronunciation URLs, example sentences,
     * concise meanings, synonyms/antonyms and phrases from the returned HTML.
     *
     * The only public entry point is content(). The result page is downloaded
     * once per content() call and shared by all extractors.
     */
    class Dict{

        // Word currently being looked up; set by content().
        private $word;

        // Raw HTML of the result page for $word; null until it has been fetched.
        private $html = null;

        // Maximum number of phrase entries processed by getPhrase().
        private static $num = 10;

        public function __construct(){}


        /**
         * Looks up a word and returns everything extracted from its result page.
         *
         * @param string $word English word to look up
         * @return array array(
         *              "symbol"  => phonetic symbols, see Pronounced()
         *              "pro"     => pronunciation URLs, see getSay()
         *              "example" => example sentences, see getExample()
         *              "explain" => concise meanings, see getExplain()
         *              "synonym" => synonyms / antonyms, see getSynonym()
         *              "phrase"  => phrases, see getPhrase()
         *          )
         */
        public function content($word){
            $this->word = $word;
            // Forget the page cached for a previous lookup so this word is fetched anew.
            $this->html = null;

            return array(
                "symbol"  => $this->Pronounced(),   // phonetic symbols
                "pro"     => $this->getSay(),       // pronunciation URLs
                "example" => $this->getExample(),   // example sentences
                "explain" => $this->getExplain(),   // concise meanings
                "synonym" => $this->getSynonym(),   // synonyms / antonyms
                "phrase"  => $this->getPhrase()     // phrases
            );
        }


        /**
         * Downloads the Baidu dictionary result page for the current word via cURL.
         *
         * The page is fetched at most once per lookup; later calls return the
         * cached copy. If cURL reports an error the message is echoed and an
         * empty string is returned, so the extractors simply find nothing.
         *
         * @return string raw HTML of the result page, or '' on failure
         */
        private function getContent(){
            if ($this->html !== null) {
                return $this->html;
            }

            $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
            // Encode the word so spaces and non-ASCII characters yield a valid URL.
            $url = "http://dict.baidu.com/s?wd=".urlencode((string) $this->word);

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_HTTPGET, 1);
            curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            curl_setopt($ch, CURLOPT_TIMEOUT, 30);
            $result = curl_exec($ch);
            // Query the handle that was actually used for the request.
            if (curl_errno($ch)) {
                echo 'Errno'.curl_error($ch);
            }
            curl_close($ch);

            // curl_exec() returns false on failure; normalise to a string for the regex extractors.
            $this->html = is_string($result) ? $result : '';
            return $this->html;
        }


        /**
         * Runs a pattern over the page and returns the first two values
         * captured by its first group, labelled 'en' and 'us'.
         *
         * @param string $pattern PCRE pattern with at least one capture group
         * @return array array('en' => string|null, 'us' => string|null)
         */
        private function firstTwoMatches($pattern){
            preg_match_all($pattern, $this->getContent(), $found);
            return array(
                'en' => isset($found[1][0]) ? $found[1][0] : null,
                'us' => isset($found[1][1]) ? $found[1][1] : null
            );
        }


        /**
         * Extracts the phonetic symbols (the text of the "EN-US" <b> elements).
         *
         * @return array array('en' => first match, 'us' => second match); null where missing
         */
        private function Pronounced(){
            return $this->firstTwoMatches("/\"EN\-US\"\>(.*)\<\/b\>/Ui");
        }

        /**
         * Extracts the pronunciation URLs (the values of url="..." attributes).
         *
         * @return array array('en' => first match, 'us' => second match); null where missing
         */
        private function getSay(){
            return $this->firstTwoMatches("/url=\"(.*)\"/Ui");
        }

        /**
         * Extracts example sentences from the page's `var example_data = ...;` script data.
         *
         * Each innermost token list is joined with spaces into one sentence and
         * consecutive sentences are grouped in twos (presumably a sentence and
         * its translation -- verify against live data).
         *
         * @return array list of array(sentence, sentence) groups; empty when the page has no example data
         */
        private function getExample(){
            preg_match_all("/var example_data = (.*)\]\;/Us", $this->getContent(), $example);
            if (!isset($example[1][0])) {
                return array();
            }

            // Normalise the leading brackets so every example starts with "[[[".
            $payload = "[[[".ltrim($example[1][0], "[");

            // Collect sentences in an array rather than a delimiter-joined string,
            // so sentences containing the delimiter character are not split apart.
            $sentences = array();
            foreach (explode("[[[", $payload) as $entry) {
                foreach (explode("[[", "[[".$entry) as $tokens) {
                    preg_match_all("/\[\"(.*)\",/Us", "[".$tokens, $match);
                    if (!empty($match[1])) {
                        $sentences[] = implode(" ", $match[1]);
                    }
                }
            }

            return array_chunk($sentences, 2);
        }

        /**
         * Extracts the concise meanings from the "en-simple-means" section.
         *
         * @return array array(
         *              "x" => array(part of speech => meaning),
         *              "b" => array(label => related word form)
         *          ); both maps are empty when the section is missing
         */
        private function getExplain(){
            preg_match_all("/id\=\"en\-simple\-means\"\>(.*)\<div(\s+)class\=\"source\"\>/Us", $this->getContent(), $explain);
            $section = isset($explain[1][0]) ? $explain[1][0] : '';

            preg_match_all("/\<p\>\<strong\>(?P<adj>.*)\<\/strong\>\<span\>(?P<name>.*)\<\/span\>\<\/p\>/Us", $section, $meanings);
            preg_match_all("/\<span\>(?P<tag>[^\>]+)\:\<a(\s+)href\=\"(.*)\"\>(?P<word>.*)\<\/a\>\<\/span\>/Us", $section, $forms);

            $byPartOfSpeech = array();
            foreach ($meanings["adj"] as $i => $partOfSpeech) {
                $byPartOfSpeech[$partOfSpeech] = $meanings["name"][$i];
            }

            $byLabel = array();
            foreach ($forms["tag"] as $i => $label) {
                $byLabel[$label] = strip_tags($forms["word"][$i]);
            }

            return array("x" => $byPartOfSpeech, "b" => $byLabel);
        }


        /**
         * Extracts synonyms and antonyms from the "en-syn-ant" section.
         *
         * @return array nested array indexed as
         *               [position of the <dl> group][group heading][item title] => words;
         *               per the original author index 0 holds synonyms and 1 antonyms.
         *               Empty when the section is missing.
         */
        private function getSynonym(){
            preg_match_all("/id=\"en\-syn\-ant\"\>(.*)<div(\s+)class\=\"source\">/Us", $this->getContent(), $synonym);
            $content = isset($synonym[1][0]) ? $synonym[1][0] : '';

            $result = array();
            foreach (explode("</dl>", $content) as $groupIndex => $group) {
                // NOTE(review): the published listing shows a bare space before ";"
                // here, which looks like an HTML-decoded "&nbsp;". Accept the entity,
                // the UTF-8 non-breaking space and a plain space -- confirm against
                // the upstream repository.
                preg_match_all("/\<strong\>(?P<adj>.*)(?:&nbsp|\xC2\xA0| )\;\<\/strong\>\<\/div\>\<div(\s+)class\=\"syn\-ant\-list\"\>\<ul\>(?<content>.*)\<\/ul\>/Us", $group, $parts);

                foreach ($parts["content"] as $k => $list) {
                    if (empty($list)) {
                        continue;
                    }
                    preg_match_all("/\<li\>\<p\>(?P<title>.*)\<\/p\>(?P<value>.*)\<\/li>/Us", $list, $items);
                    foreach ($items['title'] as $m => $title) {
                        // Turn each closing </a> into a space so adjacent words stay separated.
                        $words = strip_tags(str_replace("</a>", " ", $items["value"][$m]));
                        $result[$groupIndex][$parts["adj"][$k]][$title] = $words;
                    }
                }
            }
            return $result;
        }

        /**
         * Extracts phrases from the "en-phrase" section, processing at most
         * self::$num </dd>-delimited entries.
         *
         * @return array phrase => meaning; when an entry carries additional
         *               paragraphs the value is a nested array built from the
         *               meaning and the extra paragraph groups. Empty when the
         *               section is missing.
         */
        private function getPhrase(){
            preg_match_all("/id=\"en\-phrase\"\>(.*)\<div class\=\"source\"\>/Us", $this->getContent(), $phrase);
            if (!isset($phrase[1][0])) {
                return array();
            }

            // NOTE(review): the published listing strips a bare " " from phrase keys,
            // which may be an HTML-decoded "&nbsp;". All three forms are stripped
            // here -- confirm against the upstream repository.
            $blanks = array("&nbsp;", "\xC2\xA0", " ");

            $entries = array_slice(explode("</dd>", $phrase[1][0]), 0, self::$num);
            $result = array();
            foreach ($entries as $entry) {
                $paragraphs = explode("</p>", $entry);
                $n = count($paragraphs);

                // Fragments without a second paragraph (e.g. the markup trailing
                // the last </dd>) carry no phrase/meaning pair; skip them.
                if (!isset($paragraphs[1])) {
                    continue;
                }

                if ($n <= 3) {
                    $result[str_replace($blanks, "", strip_tags($paragraphs[0]))] = strip_tags($paragraphs[1]);
                    continue;
                }

                $head   = array_slice($paragraphs, 0, 2);
                $body   = array_slice($paragraphs, 0, $n - 1);
                // Paragraphs beyond the phrase and its meaning, grouped in twos.
                $extras = array_chunk(array_diff($body, $head), 2);

                $phraseKey = trim(str_replace($blanks, "", strip_tags($head[0])));
                $result[$phraseKey] = strip_tags($head[1]);
                foreach ($extras as $pair) {
                    foreach ($pair as $k => $v) {
                        $pair[$k] = strip_tags($v);
                    }
                    // Each extra group wraps the value accumulated so far.
                    $result[$phraseKey] = array($result[$phraseKey], $pair);
                }
            }
            return $result;
        }

        /**
         * Converts an array into an escaped string representation.
         *
         * @param  array $data       data to convert
         * @param  bool  $isformdata when truthy (default 1) the data is first passed
         *                           through new_stripslashes(); pass 0 to skip that step
         * @return string addslashes()-escaped var_export() output, or '' when $data == ''
         */
        private function array2string($data, $isformdata = 1) {
            if($data == '') return '';
            if($isformdata) $data = $this->new_stripslashes($data);
            return addslashes(var_export($data, TRUE));
        }

        /**
         * Applies stripslashes() to a string, or recursively to every element of an array.
         *
         * @param string|array $string value to process
         * @return mixed the processed string or array
         */
        private function new_stripslashes($string) {
            if(!is_array($string)) return stripslashes($string);
            foreach($string as $key => $val) $string[$key] = $this->new_stripslashes($val);
            return $string;
        }

    }

    // $dict = new Dict();
    // $result = $dict->content("express");

以上就是本文的全部内容了,非常实用的功能,希望小伙伴们能够喜欢。

 相关文章:
PHP分页显示制作详细讲解
SSH 登录失败:Host key verification failed
获取IMSI
将二进制数据转为16进制以便显示
文件下载
获取IMEI
贪吃蛇
双位运算符
发送邮件
PHP自定义函数获取搜索引擎来源关键字的方法
Java生成UUID
提取后缀名
年的日历图
在Zeus Web Server中安装PHP语言支持
让你成为最历害的git提交人
Yii2汉字转拼音类的实例代码
再谈PHP中单双引号的区别详解
指定应用ID以获取对应的应用名称
Python 2与Python 3版本和编码的对比
php封装的page分页类完整实例