<?php
namespace imcat;

// extCrawl
/**
 * Crawler helpers: read values out of a DOM through pq() and turn links
 * found on a crawled page into absolute URLs.
 *
 * NOTE(review): pq() is not defined in this file; it is presumably the
 * phpQuery helper function - confirm it is loaded before these methods run.
 */
class extCrawl{

    /**
     * Get the value/attribute of the first selector in a comma separated
     * list (<pqs1>,<pqs2>) that yields a non-empty result.
     *
     * Used when an item inside a list may follow two or more markup
     * conventions, so several alternative selectors have to be tried.
     *
     * @param mixed  $dom  Document/element handed to pq().
     * @param string $pqs  One selector, or several separated by ','.
     * @param string $attr 'text', 'html', or the name of an attribute.
     * @return mixed First truthy result; '' when $pqs is empty or when no
     *               selector of a list produced a value.
     */
    static function getDomVals($dom, $pqs, $attr='text')
    {
        if (empty($pqs)) {
            return '';
        }
        // A single selector: hand back whatever getDomOne() returns.
        if (strpos($pqs, ',') === false) {
            return self::getDomOne($dom, $pqs, $attr);
        }
        foreach (explode(',', $pqs) as $one) {
            $one = trim($one);
            if ($one === '') {
                continue; // stray comma, e.g. ",a" or "a,,b"
            }
            $res = self::getDomOne($dom, $one, $attr);
            if ($res) {
                return $res;
            }
        }
        return '';
    }

    /**
     * Get the value/attribute of a single selector (<pqs>).
     *
     * @param mixed  $dom  Document/element handed to pq().
     * @param string $pqs  Selector passed to find().
     * @param string $attr 'text' => text(), 'html' => html(),
     *                     anything else => attr($attr).
     * @return mixed Whatever the corresponding pq() call returns.
     */
    static function getDomOne($dom, $pqs, $attr='text')
    {
        $e = pq($dom)->find($pqs);
        if ($attr === 'text') {
            return pq($e)->text();
        } elseif ($attr === 'html') {
            return pq($e)->html();
        } else {
            return pq($e)->attr($attr);
        }
    }

    /**
     * Resolve a link found on a page into an absolute URL.
     *
     * @param string $src   Link URL as it appears in the page.
     * @param string $base1 URL of the current page.
     * @param string $bext  Extra base URL (set manually); when non-empty it
     *                      is used instead of $base1.
     * @return string The joined URL.
     */
    static function urlJoin($src, $base1, $bext=''){
        $base = $bext ?: $base1;
        $binfo = parse_url($base);
        if(!is_array($binfo)){ // parse_url() returns false on seriously malformed input
            $binfo = [];
        }
        $origin = self::urlOrigin($binfo);
        if(strpos($src,'://')>0){ // complete address
            return $src; // addresses such as `?rul=ftp://d/f.htm` are not considered for now
        }elseif(substr($src,0,2)=='//'){ // starts with `//`
            // Without a known scheme the protocol-relative form is the best we can return.
            return empty($binfo['scheme']) ? $src : $binfo['scheme'].':'.$src;
        }elseif(substr($src,0,1)=='/'){ // starts with `/`; for `$bext.$src` a doubled `//` can be avoided by adjusting the bext argument
            return $bext ? $bext.$src : $origin.$src;
        }
        // starts with `./`, `../` or `file.ext` (`../` may be repeated)
        $path = isset($binfo['path']) ? $binfo['path'] : '/';
        if(substr($path,-1)=='/'){
            // Directory-style (or missing) path: append a placeholder file
            // name so the "strip the last segment" regexes below have
            // something to remove.
            $path .= 'file.ext';
        }
        $url = substr($src,0,1)=='.' ? $path.$src : "$path./$src";
        $url = preg_replace("/\/([^\/]+)?(\w+)\.\//i", "/", $url, 1); // /xxx./
        $url = preg_replace("/\/([^\/]+)\/([^\/]+)?(\w+)\.\.\//i", "/", $url, 1); // /xxx/yyy../
        $url = preg_replace("/\/([^\/]+)?(\w+)\.\.\//i", "/", $url, 1); // /xxx../
        $cnt = 0; // guard in case the loop would never terminate
        while(strpos($url,'../')>0) { // /../
            $url = preg_replace("/\/([^\/]+)\/\.\.\//i", "/", $url, 1); // /xxx/../
            $url = preg_replace("/^(\/\.\.\/)+/i", "/", $url); // leading /../
            $cnt++;
            if($cnt>12) break;
        }
        return $origin.$url;
    }

    /**
     * Build "scheme://host[:port]" from a parse_url() result.
     *
     * @param array $binfo Components as returned by parse_url().
     * @return string '' when no scheme is present.
     */
    private static function urlOrigin($binfo){
        if(empty($binfo['scheme'])){
            return '';
        }
        $host = isset($binfo['host']) ? $binfo['host'] : '';
        $port = isset($binfo['port']) ? ':'.$binfo['port'] : '';
        return $binfo['scheme'].'://'.$host.$port;
    }

    /**
     * Manual check: echoes the result of urlJoin() for a set of sample
     * links and bases as HTML. Returns nothing.
     */
    static function testUrls(){

        $tb = [
            './dira/fileb.ext',
            '../dira/fileb.ext',
            '../../dira/fileb.ext',
            './../../../dira/fileb.ext',
        ];

        $base = 'http://sub1.domain.com/d1/d2/d3/f4.ext';
        $bex1 = 'http://sub2.domain.com/d1/d2/d3/';
        $bex2 = 'http://sub3.domain.com/';

        // Rows of [src, base1, bext]
        $tab = [];
        $tab[] = [
            'http://sub.domain.com/aa/bb/cc1.ex1',
            $base, ''
        ];
        $tab[] = [
            '//sub.domain.com/aa/bb/cc2.ex1',
            $base, ''
        ];
        $tab[] = [
            '/aa/bb/cc3.ex1',
            $base, ''
        ];
        $tab[] = [
            '/aa/bb/cc3.ex1',
            '', substr($bex1,0,strlen($bex1)-1)
        ];
        $tab[] = [
            '/aa/bb/cc3.ex1',
            '', substr($bex2,0,strlen($bex2)-1)
        ];
        foreach ($tab as $row) {
            $url = self::urlJoin($row[0], $row[1], $row[2]);
            echo "org: $row[0],<br> base: ".($row[1] ?: '(null)').",<br> bext: ".($row[2] ?: '(null)')."
                <br> -=> $url<br><br>\n";
        } echo "<hr>\n";

        // Relative links against a page URL (no extra base)
        $exts1 = [$base, $bex1];
        foreach ($exts1 as $urlb) {
            foreach ($tb as $path) {
                $url = self::urlJoin($path, $urlb, '');
                echo "org: $path,<br> base: $urlb,<br> bext: (null) 
                <br> -=> $url<br><br>\n";
            }
        } echo "<hr>\n";

        // Relative links against a manually supplied extra base
        $exts2 = [$bex1, $bex2];
        foreach ($exts2 as $urlb) {
            foreach ($tb as $path) {
                $url = self::urlJoin($path, '', $urlb);
                echo "org: $path,<br> base: (null),<br> bext: $urlb 
                <br> -=> $url<br><br>\n";
            }
        } echo "<hr>\n";

    }

}