Skymvc是一款简洁、快速、高效的PHP开发框架,是敏捷开发的必备工具。它支持多模板机制,一套程序可以同时开发PC端、WAP端和APP接口。
下载地址:github下载

4.25 skymvc的蜘蛛库cls_solink.php

发布时间:2016-05-16

skymvc的蜘蛛库cls_solink.php


// Usage example. It is written with $this, so it presumably runs inside a
// framework object (e.g. a controller) that provides loadClass() — confirm.
// Load the solink class through the framework loader.
// NOTE(review): the meaning of the two `false` arguments is not shown here.
$this->loadClass("solink",false,false);
// Create a spider for the start URL; the constructor parses host/scheme/domain.
$solink= new solink(array("url"=>"http://www.skymvc.com"));
// Download the page and store its body (converted to UTF-8) in $solink->content.
$solink->get_content();
// Print the text of the page's <title> element.
echo $solink->get_title();
// Dump the links found in the page.
print_r($solink->get_link());

附:cls_solink.php

<?php
/*
$solink= new solink(array("url"=>"http://www.skymvc.com"));
$solink->get_content();
 $solink->get_title();
print_r($solink->get_link());

*/
/**
 * Minimal page spider: downloads one URL, converts the body to UTF-8 and
 * extracts the page title and the links found in <a href="..."> attributes.
 *
 * Usage:
 *   $solink = new solink(array("url" => "http://www.skymvc.com"));
 *   $solink->get_content();
 *   echo $solink->get_title();
 *   print_r($solink->get_link());
 */
class solink{
	public $content;  // page body as stored by get_content(); false when the download failed
	public $url;      // URL to crawl
	public $dir;      // dirname() of $url (kept for backward compatibility)
	public $host;     // host name of $url
	public $scheme;   // scheme of $url ("http", "https", ...)
	public $selfsite; // link filter: 1 = same host only, 0 = main domain and its subdomains, 2 = everything
	public $domain;   // main domain derived from $host

	/**
	 * @param array $config "url": page to crawl (default "");
	 *                      "selfsite": link filter, see $selfsite (default 0).
	 */
	public function __construct($config=array()){
		$this->url=isset($config['url'])?$config['url']:"";
		$this->selfsite=isset($config['selfsite'])?$config['selfsite']:0;
		$this->parseurl();
	}

	/**
	 * Re-targets the spider at another URL.
	 *
	 * Note: unlike the constructor, "selfsite" defaults to 2 (collect all
	 * links) here. The difference is kept because callers may rely on it.
	 *
	 * @param array $config same keys as the constructor.
	 */
	public function set($config=array()){
		$this->url=isset($config['url'])?$config['url']:"";
		$this->selfsite=isset($config['selfsite'])?$config['selfsite']:2;
		$this->parseurl();
	}

	/**
	 * Downloads $this->url and stores the UTF-8 converted body in $this->content.
	 */
	public function get_content(){
		$this->content=$this->toutf8($this->curl_get_contents($this->url));
	}

	/**
	 * Extracts the links of the loaded page.
	 *
	 * @return array list of absolute URLs that passed the selfsite filter;
	 *               always an array, empty when nothing was found.
	 */
	public function get_link(){
		$urls=array();
		$html=(string)$this->content;
		if($html==="") return $urls;
		if(!preg_match_all("/<a[^>]*href=[\"']([^\"']*)[\"']/iUs",$html,$a)) return $urls;
		foreach($a[1] as $v){
			$u=$this->builtlink($v);
			if($u) $urls[]=$u;
		}
		return $urls;
	}

	/**
	 * @return string|false raw content of the first <title> element, or
	 *                      false when the page has none.
	 */
	public function get_title(){
		// [^>]* also accepts a <title> tag that carries attributes.
		if(preg_match("/<title[^>]*>(.*)<\/title>/iUs",(string)$this->content,$a)) return $a[1];
		return false;
	}

	/**
	 * Splits $this->url into $host, $scheme, $domain and $dir.
	 * Missing parts become "" instead of raising undefined-index notices.
	 */
	public function parseurl(){
		$arr=parse_url($this->url);
		if(!is_array($arr)) $arr=array();
		$this->host=isset($arr['host'])?$arr['host']:"";
		$this->scheme=isset($arr['scheme'])?$arr['scheme']:"";
		// The pattern is anchored at the end of the host so that the TLD is
		// matched where it really is ("www.cctv.com" must give "cctv.com",
		// not "www.cc"); "-" is allowed because it is legal in host labels.
		if(preg_match('/([\w-]+\.(com|cn|net|org|cc|info|me|co|tv|name|tel|so)(\.cn)?)$/i',$this->host,$c)){
			$this->domain=$c[1];
		}else{
			// Unknown TLD: treat the whole host as the main domain.
			$this->domain=$this->host;
		}
		$this->dir=dirname($this->url);
	}

	/**
	 * Unimplemented stub: builds a list of TLD suffixes and returns nothing.
	 * Kept only so existing callers do not break.
	 */
	public function parsedomain($url){
		$d=array(
			".com",
			".cn",
			".com.cn",
			".net"
		);
	}

	/**
	 * Turns an href value into an absolute URL, applying the selfsite filter.
	 *
	 * "./" and "../" segments are not collapsed; they are left for the
	 * server or HTTP client to resolve.
	 *
	 * @param string $url href value as found in the page.
	 * @return string absolute URL, or "" when the link must be skipped
	 *                (empty, in-page anchor, non-HTTP scheme such as
	 *                javascript:/mailto:, or rejected by the filter).
	 */
	public function builtlink($url){
		$url=trim((string)$url);
		if($url==="") return "";
		if($url[0]=="#") return "";
		$absolute=preg_match('/^https?:\/\//i',$url);
		// Any other "scheme:" prefix (javascript:, mailto:, tel:, ...) is not crawlable.
		if(!$absolute && preg_match('/^[a-z][a-z0-9+.\-]*:/i',$url)) return "";

		// Only links that name a host can leave the site; relative ones always pass.
		$linkhost=parse_url($url,PHP_URL_HOST);
		if(is_string($linkhost) && $linkhost!==""){
			if($this->selfsite==1){
				if(strcasecmp($linkhost,(string)$this->host)!=0) return "";
			}elseif($this->selfsite==0){
				if(!$this->indomain($linkhost)) return "";
			}
		}

		if($absolute) return $url;
		// Protocol-relative link: reuse the scheme of the current page.
		if(substr($url,0,2)=="//") return $this->scheme.":".$url;
		if($url[0]=="/") return $this->origin().$url;
		if($url[0]=="?"){
			// Query-only link: same document, different query string.
			$path=parse_url($this->url,PHP_URL_PATH);
			return $this->origin().((is_string($path) && $path!=="")?$path:"/").$url;
		}
		return $this->origin().$this->basepath().$url;
	}

	/** Returns "scheme://host[:port]" of the page being crawled. */
	private function origin(){
		$port=parse_url($this->url,PHP_URL_PORT);
		return $this->scheme."://".$this->host.($port?":".$port:"");
	}

	/** Returns the directory part of the current URL path, always ending in "/". */
	private function basepath(){
		$path=parse_url($this->url,PHP_URL_PATH);
		if(!is_string($path) || $path==="") return "/";
		$pos=strrpos($path,"/");
		return $pos===false?"/":substr($path,0,$pos+1);
	}

	/** Tells whether $host is the main domain itself or one of its subdomains. */
	private function indomain($host){
		$host=strtolower($host);
		$domain=strtolower((string)$this->domain);
		if($domain==="") return false;
		if($host===$domain) return true;
		$suffix=".".$domain;
		return substr($host,-strlen($suffix))===$suffix;
	}

	/**
	 * Fetches a URL with cURL (30 second timeout, headers stripped).
	 *
	 * @param string $url
	 * @return string|false response body, or false when the request failed.
	 */
	public function curl_get_contents($url){
		$ch = curl_init();
		if($ch===false) return false;
		curl_setopt($ch, CURLOPT_URL, $url);
		curl_setopt($ch, CURLOPT_HEADER, 0);
		curl_setopt($ch,CURLOPT_TIMEOUT,30);
		curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);
		$content=curl_exec($ch);
		curl_close($ch);
		return $content;
	}

	/**
	 * Converts a GBK encoded string to UTF-8 and leaves UTF-8 input alone.
	 *
	 * @param mixed $str downloaded body; non-strings (e.g. false from a
	 *                   failed download) are returned unchanged.
	 * @return mixed UTF-8 string, or the input when its encoding is unknown.
	 */
	public function toutf8($str){
		if(!is_string($str) || $str==="") return $str;
		// UTF-8 has to be tested first: many valid UTF-8 byte sequences also
		// form valid GBK pairs, so probing GBK first would corrupt them.
		// Plain ASCII is valid UTF-8 and is covered by this check as well.
		if(preg_match('//u',$str)) return $str;
		// Accept GBK only when the conversion round-trips byte for byte,
		// i.e. //IGNORE did not have to drop anything.
		$utf8=iconv("gbk","utf-8//IGNORE",$str);
		if($utf8!==false && $str===iconv("utf-8","gbk//IGNORE",$utf8)){
			return $utf8;
		}
		return $str;
	}
}
?>