<?php
/**
 * Breadth-first, same-host web crawler built on curl_multi.
 *
 * Starting from a seed URL, pages are fetched level by level (depth 0 is the
 * seed itself). Links found in anchor tags and redirect targets reported by
 * curl are queued for the next depth level. Crawling stops once max_depth
 * levels have been processed or max_page URLs have been accepted.
 */
class d_crawler {
    /** @var string Seed URL, always carrying an http:// or https:// scheme. */
    private $url;
    /** @var int Deepest link level to fetch (inclusive; the seed is depth 0). */
    private $max_depth;
    /** @var int Maximum number of URLs accepted for crawling. */
    private $max_page;
    /** @var int Maximum number of simultaneous transfers. */
    private $max_thread;
    /** @var string Host of the seed URL; only this host and its subdomains are crawled. */
    private $host;
    /** @var bool Whether HTTP authentication credentials are sent. */
    private $http_auth = false;
    /** @var string HTTP authentication user name. */
    private $user;
    /** @var string HTTP authentication password. */
    private $pass;
    /** @var callable|false Invoked after every finished transfer, or false for none. */
    private $callback = false;
    /** @var array Crawl results keyed by URL; an empty array marks "queued, not fetched yet". */
    private $page = array();
    /** @var array Substring filters, grouped under the keys 'include' and 'exclude'. */
    private $filter = array('include' => array(), 'exclude' => array());

    /**
     * @param string $url        Seed URL; "http://" is prepended when no http(s) scheme is present.
     * @param int    $max_depth  Deepest link level to fetch (inclusive).
     * @param int    $max_page   Maximum number of URLs to accept.
     * @param int    $max_thread Maximum number of simultaneous transfers.
     *
     * @throws InvalidArgumentException When no host can be extracted from $url.
     */
    public function __construct($url, $max_depth = 5, $max_page = 100, $max_thread = 10) {
        // Only a leading scheme counts; "http" may legitimately occur inside a path.
        if (!preg_match('#^https?://#i', $url)) {
            $url = 'http://' . $url;
        }

        $host = parse_url($url, PHP_URL_HOST);

        if (!is_string($host) || $host === '') {
            throw new InvalidArgumentException('Unable to determine the host of URL: ' . $url);
        }

        $this->url = $url;
        $this->host = $host;
        $this->max_depth = $max_depth;
        $this->max_page = $max_page;
        $this->max_thread = $max_thread;
    }

    /**
     * Enables HTTP authentication for every request.
     *
     * @param string $user User name.
     * @param string $pass Password.
     */
    public function setHttpAuth($user, $pass) {
        $this->http_auth = true;
        $this->user = $user;
        $this->pass = $pass;
    }

    /**
     * Registers a substring filter.
     *
     * A URL is crawled only if it contains every 'include' substring and
     * none of the 'exclude' substrings.
     *
     * @param string $path Substring to look for in candidate URLs.
     * @param string $type Either 'include' or 'exclude'; other types are stored but never consulted.
     */
    public function addFilter($path, $type = 'include') {
        $this->filter[$type][] = $path;
    }

    /**
     * Sets a callback that is invoked after each finished transfer.
     *
     * @param callable $callback Receives the array of all pages fetched so far.
     */
    public function setCallback($callback) {
        $this->callback = $callback;
    }

    /**
     * Runs the crawl.
     *
     * @return array Fetched pages keyed by effective URL. Each entry holds the keys
     *               url, http_code, redirect_url, time, html and depth.
     */
    public function run() {
        $links = array($this->url);

        // Guard against a zero or negative limit, which would never start a transfer.
        $max_thread = max(1, (int) $this->max_thread);

        for ($depth = 0; $depth <= $this->max_depth; $depth++) {
            $urls = array();

            if ($links && (count($this->page) < $this->max_page)) {
                foreach ($links as $url) {
                    if ($this->validateURL($url)) {
                        $urls[] = $url;

                        // Placeholder: marks the URL as seen so it is not queued twice.
                        $this->page[$url] = array();
                    }
                }
            }

            $links = array();

            while ($urls) {
                $cmh = curl_multi_init();
                $pending = 0; // handles currently attached to $cmh

                while ($urls && ($pending < $max_thread)) {
                    $pending += $this->addHandle($cmh, array_shift($urls));
                }

                $active = null;
                $mrc = CURLM_OK;

                while (($pending > 0) && ($mrc == CURLM_OK)) {
                    do {
                        $mrc = curl_multi_exec($cmh, $active);
                    } while ($mrc == CURLM_CALL_MULTI_PERFORM);

                    $finished = 0;
                    $refilled = false;

                    // Drain the whole message queue: several transfers may finish in one pass.
                    while (($done = curl_multi_info_read($cmh)) !== false) {
                        if ($done['msg'] != CURLMSG_DONE) {
                            continue;
                        }

                        $ch = $done['handle'];

                        $links = array_merge($links, $this->collectResult($ch, $depth));

                        curl_multi_remove_handle($cmh, $ch);
                        $this->closeHandle($ch);

                        $pending--;
                        $finished++;

                        // Keep the pool full while URLs of this depth level remain.
                        while ($urls && ($pending < $max_thread)) {
                            $pending += $this->addHandle($cmh, array_shift($urls));
                            $refilled = true;
                        }
                    }

                    if ($refilled) {
                        // Fresh handles have no sockets yet; run them before waiting.
                        continue;
                    }

                    if (!$active) {
                        if ($finished === 0) {
                            // Nothing running and nothing finished: bail out instead of spinning.
                            break;
                        }

                        continue;
                    }

                    if (($mrc == CURLM_OK) && (curl_multi_select($cmh) == -1)) {
                        // select() can fail immediately on some platforms; back off briefly.
                        usleep(100);
                    }
                }

                curl_multi_close($cmh);
            }
        }

        return array_filter($this->page);
    }

    /**
     * Creates a configured easy handle for $url and attaches it to the multi handle.
     *
     * @param resource|object $cmh Multi handle from curl_multi_init().
     * @param string          $url URL to fetch.
     *
     * @return int 1 when the handle was attached, 0 otherwise.
     */
    private function addHandle($cmh, $url) {
        $ch = curl_init($url);

        if ($ch === false) {
            return 0;
        }

        if ($this->http_auth) {
            curl_setopt($ch, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
            curl_setopt($ch, CURLOPT_USERPWD, $this->user . ":" . $this->pass);
        }

        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

        if (curl_multi_add_handle($cmh, $ch) !== CURLM_OK) {
            $this->closeHandle($ch);

            return 0;
        }

        return 1;
    }

    /**
     * Releases an easy handle.
     *
     * @param resource|object $ch Easy handle to release.
     */
    private function closeHandle($ch) {
        // Since PHP 8.0 handles are objects freed automatically and curl_close() is a no-op.
        if (PHP_VERSION_ID < 80000) {
            curl_close($ch);
        }
    }

    /**
     * Stores the result of a finished transfer and returns the URLs it leads to.
     *
     * @param resource|object $ch    Finished easy handle.
     * @param int             $depth Depth level the page was fetched at.
     *
     * @return array Redirect target (if any) followed by the links found in the body.
     */
    private function collectResult($ch, $depth) {
        $html = curl_multi_getcontent($ch);
        $info = curl_getinfo($ch);

        $redirect_url = isset($info['redirect_url']) ? $info['redirect_url'] : '';

        $this->page[$info['url']] = array(
            'url'           => $info['url'],
            'http_code'     => $info['http_code'],
            'redirect_url'  => $redirect_url,
            'time'          => $info['total_time'],
            'html'          => $html,
            'depth'         => $depth
        );

        if ($this->callback) {
            call_user_func($this->callback, array_filter($this->page));
        }

        $links = array();

        if ($redirect_url) {
            $links[] = $redirect_url;
        }

        return array_merge($links, $this->getLinks($html, $info['url']));
    }

    /**
     * Extracts the targets of all anchor tags as absolute URLs without fragment.
     *
     * Relative references are resolved against $url. Empty and fragment-only
     * hrefs as well as non-http(s) schemes (mailto:, javascript:, ...) are skipped.
     *
     * @param string $html Document body.
     * @param string $url  URL the document was fetched from.
     *
     * @return array List of absolute URLs, in document order.
     */
    private function getLinks($html, $url) {
        $links = array();

        // DOMDocument::loadHTML() throws on an empty string in PHP 8; failed transfers yield one.
        if (!is_string($html) || trim($html) === '') {
            return $links;
        }

        $dom = new DOMDocument('1.0');

        if (!@$dom->loadHTML($html)) {
            return $links;
        }

        $base = $this->getURLInfo($url);
        $scheme = ($base['scheme'] !== '') ? $base['scheme'] : 'http://';
        $origin = $scheme . $base['user'] . $base['pass'] . $base['host'] . $base['port'];

        $base_path = ($base['path'] !== '') ? $base['path'] : '/';
        $slash = strrpos($base_path, '/');
        $base_dir = ($slash === false) ? '/' : substr($base_path, 0, $slash + 1);

        foreach ($dom->getElementsByTagName('a') as $anchor) {
            $href = trim($anchor->getAttribute('href'));

            if ($href === '' || $href[0] === '#') {
                continue;
            }

            // Protocol-relative reference: inherit the scheme of the current page.
            if (substr($href, 0, 2) === '//') {
                $href = $scheme . substr($href, 2);
            }

            if (preg_match('#^https?://#i', $href)) {
                $url_info = $this->getURLInfo($href);

                if ($url_info['host'] !== '') {
                    $links[] = $url_info['scheme'] . $url_info['user'] . $url_info['pass'] . $url_info['host'] . $url_info['port'] . $url_info['path'] . $url_info['query'];
                }

                continue;
            }

            // Any other scheme cannot be crawled over HTTP.
            if (preg_match('#^[a-z][a-z0-9+.-]*:#i', $href)) {
                continue;
            }

            $url_info = $this->getURLInfo($href);

            if ($url_info['path'] === '') {
                // Query-only reference such as "?page=2".
                $path = $base_path;
            } elseif ($url_info['path'][0] === '/') {
                $path = $url_info['path'];
            } else {
                $path = $base_dir . $url_info['path'];
            }

            $links[] = $origin . $this->removeDotSegments($path) . $url_info['query'];
        }

        return $links;
    }

    /**
     * Collapses "." and ".." segments of an absolute path.
     *
     * @param string $path Path starting with "/".
     *
     * @return string Normalized path.
     */
    private function removeDotSegments($path) {
        $segments = explode('/', $path);
        $last = end($segments);
        $out = array();

        foreach ($segments as $segment) {
            if ($segment === '.') {
                continue;
            }

            if ($segment === '..') {
                // Never pop the leading empty segment that represents the root.
                if (count($out) > 1) {
                    array_pop($out);
                }

                continue;
            }

            $out[] = $segment;
        }

        // A trailing "." or ".." refers to a directory, so keep the trailing slash.
        if ($last === '.' || $last === '..') {
            $out[] = '';
        }

        return implode('/', $out);
    }

    /**
     * Splits a URL into components that can be concatenated back directly.
     *
     * Every component is always present; missing ones are empty strings.
     * Delimiters are included: scheme ends with "://", pass carries the ":" and
     * the trailing "@", port starts with ":", query with "?" and fragment with "#".
     * The key 'data' holds the parsed query parameters.
     *
     * @param string $url URL or relative reference.
     *
     * @return array Component map.
     */
    private function getURLInfo($url) {
        $url_info = parse_url(str_replace('&amp;', '&', $url));

        // parse_url() returns false for seriously malformed input.
        if (!is_array($url_info)) {
            $url_info = array();
        }

        $url_info['scheme'] = isset($url_info['scheme']) ? $url_info['scheme'] . '://' : '';
        $url_info['user'] = isset($url_info['user']) ? $url_info['user'] : '';
        $url_info['pass'] = isset($url_info['pass']) ? ':' . $url_info['pass'] : '';
        $url_info['pass'] = ($url_info['user'] !== '' || $url_info['pass'] !== '') ? $url_info['pass'] . '@' : '';
        $url_info['host'] = isset($url_info['host']) ? $url_info['host'] : '';
        $url_info['port'] = isset($url_info['port']) ? ':' . $url_info['port'] : '';
        $url_info['path'] = isset($url_info['path']) ? $url_info['path'] : '';

        $url_info['data'] = array();

        if (isset($url_info['query'])) {
            parse_str($url_info['query'], $url_info['data']);
        }

        $url_info['query'] = isset($url_info['query']) ? '?' . $url_info['query'] : '';
        $url_info['fragment'] = isset($url_info['fragment']) ? '#' . $url_info['fragment'] : '';

        return $url_info;
    }

    /**
     * Decides whether a URL should be queued for crawling.
     *
     * @param string $url Candidate URL.
     *
     * @return bool True when the URL is new, the page limit is not reached, its host
     *              is the crawl host or a subdomain of it, and all filters accept it.
     */
    private function validateURL($url) {
        if (isset($this->page[$url]) || (count($this->page) >= $this->max_page)) {
            return false;
        }

        // Compare the parsed host; a substring test would accept foreign hosts
        // that merely mention the crawl host somewhere in the URL.
        $host = parse_url($url, PHP_URL_HOST);

        if (!is_string($host) || $host === '') {
            return false;
        }

        $host = strtolower($host);
        $own = strtolower($this->host);
        $suffix = '.' . $own;

        if (($host !== $own) && (substr($host, -strlen($suffix)) !== $suffix)) {
            return false;
        }

        foreach ($this->filter['include'] as $include) {
            if (strpos($url, $include) === false) {
                return false;
            }
        }

        foreach ($this->filter['exclude'] as $exclude) {
            if (strpos($url, $exclude) !== false) {
                return false;
            }
        }

        return true;
    }
}
?>