<?
// -------------PHPDIG-1.0-----------
// Here are the functions used by phpdig,
// separated into sections
// Antoine Bajolet <bajolet@toiletoine.net>
//-------------STRING FUNCTIONS
//=================================================
//converts an HTTP date header (e.g. "Sun, 06 Nov 1994 08:49:37 GMT")
//into a 14-digit YYYYMMDDHHMMSS string.
//$date : header value; the weekday and the trailing zone word must be
//        present for the match but are not used in the result.
//The month word is looked up, lowercased, in the global $month_names
//array; a name missing from that array yields month "00".
//returns the formatted string, or null when $date does not match.
function http_to_sqldate($date)
{
    global $month_names;
    //preg_match() replaces eregi(), which no longer exists since PHP 7
    $pattern = '/(([a-z]{3}), ([0-9]{1,2}) ([a-z]+) ([0-9]{4}) ([0-9:]{8}) ([a-z]+))/i';
    if (!preg_match($pattern, (string)$date, $regs))
        return null;
    $month_key = strtolower($regs[4]);
    $month_num = 0;
    if (is_array($month_names) && isset($month_names[$month_key]))
        $month_num = $month_names[$month_key];
    $month = sprintf('%02d', $month_num);
    $year = sprintf('%04d', $regs[5]);
    $day = sprintf('%02d', $regs[3]);
    //"08:49:37" becomes "084937"
    $hour = sprintf('%06d', str_replace(':', '', $regs[6]));
    return $year.$month.$day.$hour;
}
//=================================================
//fetch a localized message
//$string : key into the global $phpdig_mess array
//returns the message with its newlines turned into <br /> by nl2br()
function msg($string='')
{
    global $phpdig_mess;
    $localized = $phpdig_mess[$string];
    return nl2br($localized);
}
//output a localized message
//$string : key into the global $phpdig_mess array
//prints the message after nl2br() conversion; returns nothing
function pmsg($string='')
{
    global $phpdig_mess;
    $markup = nl2br($phpdig_mess[$string]);
    print $markup;
}
//=================================================
//load the common words list into a lookup array
//$file : path of a text file holding one word per line
//returns array(word => 1, ...) built from the trimmed lines, or
//array('aaaa' => 1) when the file cannot be read (or no name is given),
//so the result is always an array.
function common_words($file='')
{
    $common = array();
    //an empty name is never readable; file('') throws on PHP 8
    $lines = ($file != '') ? @file($file) : false;
    if (!is_array($lines))
    {
        $common['aaaa'] = 1;
        return $common;
    }
    //foreach replaces each(), which no longer exists since PHP 8
    foreach ($lines as $word)
        $common[trim($word)] = 1;
    return $common;
}
//=================================================
//highlight a string part
//$word   : regular expression (matched case-insensitively) to emphasize
//$string : text to search
//returns $string with every match wrapped in <B>...</B>. The text is
//returned unchanged when $word is empty or is not a valid expression.
function highlight($word="",$string="")
{
    //nothing to highlight: hand the text back untouched
    if (!$word)
        return $string;
    //preg_replace() replaces eregi_replace(), removed in PHP 7;
    //the delimiter is escaped so a "/" inside $word stays literal
    $pattern = '/('.str_replace('/', '\\/', $word).')/i';
    $marked = @preg_replace($pattern, '<B>\\1</B>', $string);
    //null signals a pattern that failed to compile
    if ($marked === null)
        return $string;
    return $marked;
}
//=================================================
//replace accented characters by their unaccented equivalent
//$chaine : text to convert
//returns the converted text
//NOTE(review): the three-argument form of strtr() maps byte for byte, so
//the two tables only line up when this file and the input use a
//single-byte charset (presumably ISO-8859-1) - confirm.
//NOTE(review): both tables hold a blank at the same position, which is a
//no-op; possibly a character lost in transit - TODO confirm.
function stripaccents($chaine)
{
    $accented = "ÀÁÂÃÄÅàáâãäåÒÓÔÕÖØòóôõöøÈÉÊËèéêëÇçÌÍÎÏìíîïÙÚÛÜùúû üÿÑñ";
    $plain = "AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuu uyNn";
    $converted = strtr($chaine, $accented, $plain);
    return $converted;
}
//=================================================
//reduce a text to a blank-separated list of lowercase words
//(words may contain the & and _ characters)
//$text            : text to clean
//$min_word_length : words of up to this many characters are dropped
//                   (so the shortest word kept is one character longer)
//returns the cleaned, trimmed text.
//Tokens made only of digits are dropped too, except one at the very
//start or end of the text, which is only subject to the length rule.
function epure_text($text,$min_word_length=2)
{
    $text = stripaccents(strtolower($text));
    //every run of other characters becomes a single space
    $text = preg_replace('/[^[:alnum:]_&]+/', ' ', $text);
    //drop numbers; the lookahead leaves the closing blank in place so the
    //next token can still be matched (the consuming POSIX pattern skipped
    //every second one of a run)
    $text = preg_replace('/[[:blank:]][0-9]+(?=[[:blank:]])/', '', $text);
    //drop short words, same lookahead technique, on the padded text
    $max_dropped = (int)$min_word_length;
    if ($max_dropped >= 1)
        $text = preg_replace('/[[:blank:]][^ ]{1,'.$max_dropped.'}(?=[[:blank:]])/', '', ' '.$text.' ');
    return trim(preg_replace('/[[:blank:]]+/', ' ', $text));
}
//=================================================
//advanced striptags function.
//$text : HTML source
//returns array('content' => plain text, 'title' => content of <title>)
//The global $spec array is read as entity pattern => replacement; each
//key is matched case-insensitively with an optional trailing ";".
function html_to_plain_text($text)
{
    //htmlentities
    global $spec;
    //replace blank characters by spaces
    $text = preg_replace('/[\r\n\t]+/', ' ', $text);
    //extracts title (before the head section is thrown away)
    $title = '';
    if (preg_match('/<title>([^<>]*)<\/title>/i', $text, $regs))
        $title = $regs[1];
    //delete content of head, script, and style tags; the quantifier is
    //lazy so each element is removed on its own instead of everything
    //between the first opening tag and the last closing tag
    $text = preg_replace('/<head[^<>]*>.*?<\/head>/is', ' ', $text);
    $text = preg_replace('/<script[^>]*>.*?<\/script>/is', ' ', $text);
    $text = preg_replace('/<style[^>]*>.*?<\/style>/is', ' ', $text);
    //put a space after simple tags so adjacent words do not merge
    $text = preg_replace('/(<\/[a-z0-9 ]+>)/i', '\\1 ', $text);
    $text = preg_replace('/(<[a-z0-9 ]+>)/i', '\\1 ', $text);
    //tries to replace htmlentities by ascii equivalent
    if (is_array($spec))
    {
        foreach ($spec as $entity => $replacement)
        {
            $pattern = '/'.str_replace('/', '\\/', $entity).'[;]?/i';
            //keep a "$" in the replacement literal
            $plain = str_replace('$', '\\$', $replacement);
            $converted = preg_replace($pattern, $plain, $text);
            //null means the pattern did not compile: keep the text as is
            if ($converted !== null)
                $text = $converted;
            $converted = preg_replace($pattern, $plain, $title);
            if ($converted !== null)
                $title = $converted;
        }
    }
    //numeric entities: &#NNN; becomes the character NNN followed by a
    //space. The former chr('\1') was evaluated before the replacement ran
    //and always produced a NUL byte. Codes outside 1..255 have no
    //single-byte character and are replaced by the space alone.
    if (preg_match_all('/&#([0-9]+);/', $text, $entities, PREG_SET_ORDER))
    {
        foreach ($entities as $found)
        {
            $code = (int)$found[1];
            $char = ($code > 0 && $code < 256) ? chr($code) : '';
            $text = str_replace($found[0], $char.' ', $text);
        }
    }
    //replace blank characters by spaces (entities may have produced some)
    $text = preg_replace('/[\r\n\t]+/', ' ', $text);
    //closing tags, double dashes and some punctuation become spaces
    $text = preg_replace('/<\/[a-z0-9]+>/i', ' ', $text);
    $text = preg_replace('/--|[{}();"]+/', ' ', $text);
    //replace any group of blank characters by an unique space
    $text = preg_replace('/[[:blank:]]+/', ' ', strip_tags($text));
    $retour['content'] = $text;
    $retour['title'] = $title;
    return $retour;
}
//=================================================
//purify urls from relative components like ./ or ../
//$eval : link as found in a page (path, optionally with a query)
//returns -1 for links that must not be followed:
//  - mailto:, javascript: and news: links
//  - links that parse_url() cannot parse
//  - relative paths climbing above their starting directory
//  - files named redir.php3...
//otherwise returns an array:
//  'path'     directory part, no leading slash, trailing slash ('' = root)
//  'file'     file name, followed by "?query" when a query is present
//  'as_query' set to 1 only when a query is present
function url_purify($eval)
{
    //delete special links
    if (preg_match('/(mailto|javascript|news):/i', $eval))
        return -1;
    $url = @parse_url($eval);
    if (!is_array($url))
        return -1;
    $raw_path = isset($url['path']) ? $url['path'] : '';
    $absolute = (substr($raw_path, 0, 1) === '/');
    $segments = explode('/', $raw_path);
    //a path ending in "/", "/." or "/.." designates a directory
    $last = $segments[count($segments) - 1];
    $is_dir = ($last === '' || $last === '.' || $last === '..');
    //resolve the path one segment at a time, so nested "../.." sequences
    //and names that merely end with a dot are handled correctly
    $stack = array();
    foreach ($segments as $segment)
    {
        //empty segments come from doubled slashes
        if ($segment === '' || $segment === '.')
            continue;
        if ($segment === '..')
        {
            if (count($stack) > 0)
                array_pop($stack);
            //path outside site tree
            elseif (!$absolute)
                return -1;
            //an absolute path cannot go above the root: ignore the step
            continue;
        }
        $stack[] = $segment;
    }
    $file = '';
    if (!$is_dir && count($stack) > 0)
        $file = array_pop($stack);
    $retour['path'] = (count($stack) > 0) ? implode('/', $stack).'/' : '';
    if (isset($url['query']) && $url['query'] !== '')
    {
        $file .= "?".$url['query'];
        $retour['as_query'] = 1;
    }
    $retour['file'] = $file;
    if (preg_match('/^redir[.]php3/', $file))
        return -1;
    return $retour;
}
//-------------HTTP FUNCTIONS
//Test presence and type of an url by reading its HTTP response headers
//$url  : absolute URL; host, port, path and query come from parse_url()
//$mode : 'date' returns array('status' => ..., 'lm_date' => ...);
//        any other value returns the status alone
//Status values set here:
//  'NOHOST'    the socket could not be opened
//  'NOFILE'    the first response line holds no 2xx/3xx code
//  'HTML'      Content-Type is text/html (or "<html" was read, see below)
//  'PLAINTEXT' Content-Type is text/plain and the requested path has no
//              extension, or an extension containing "txt"
//  'TEXT'      any other text/* type
//  'LOOP'      more than four Location headers were read
//The status stays null when no header line sets it. 'lm_date' is the raw
//value of the Last-Modified header, or null when none was read.
function test_url($url,$mode='simple')
{
    global $phpdig_version;
    $status = null;
    $date = null;
    //explicit line ending: the bytes sent must not depend on how this
    //source file happens to be saved
    $eol = "\r\n";
    $components = parse_url($url);
    if (!is_array($components))
        $components = array();
    $host = isset($components["host"]) ? $components["host"] : '';
    $port = isset($components["port"]) ? (int)$components["port"] : 0;
    $path = isset($components["path"]) ? $components["path"] : '';
    $query = isset($components["query"]) ? $components["query"] : '';
    //a request line needs a target even for "http://host"
    if ($path == '')
        $path = '/';
    //connect to the port given in the URL, 80 otherwise (the connection
    //port used to be left undefined whenever the URL named one)
    $cport = $port ? $port : 80;
    //the Host header carries the port only when the URL names one
    $host_header = $port ? $host.":".$port : $host;
    $fp = ($host != '') ? fsockopen($host,$cport) : false;
    if (!$fp)
    {
        //host domain not found
        $status = "NOHOST";
    }
    else
    {
        if ($query)
            $path .= "?".$query;
        //bare line ending written before every header read
        $req = $eol;
        //complete get, closed by the empty line that ends a header block
        $request = "GET $path HTTP/1.1".$eol
            ."Host: $host_header".$eol
            ."Accept: */*".$eol
            ."Accept-Charset: iso-8859-1".$eol
            ."Accept-Encoding: identity".$eol
            ."User-Agent: PhpDig/$phpdig_version (PHP; MySql)".$eol
            .$eol;
        fputs($fp,$request);
        $answer = fgets($fp,4096);
        //test return code
        if (preg_match('/([2-3])[0-9]{2}/', (string)$answer, $regs))
        {
            $code = $regs[1];
            $redirs = 0;
            //pending request for a redirect target, sent on the next pass
            $req1 = '';
            while ($answer)
            {
                if ($req1)
                {
                    $cur_req = $req1;
                    $req1 = '';
                }
                else
                    $cur_req = $req;
                fputs($fp,$cur_req);
                $answer = fgets($fp,4096);
                $line = (string)$answer;
                //parse header location
                if ($code == 3 && preg_match('/Location: *(.*)/', $line, $regs))
                {
                    $redirs++;
                    if ($redirs > 4)
                    {
                        $answer = "";
                        $line = "";
                        $status = "LOOP";
                    }
                    //drop the line ending captured with the header value
                    $path = trim($regs[1]);
                    $newurl = parse_url($path);
                    if (!is_array($newurl))
                        $newurl = array();
                    $new_host = isset($newurl['host']) ? $newurl['host'] : '';
                    $new_path = isset($newurl['path']) ? $newurl['path'] : '';
                    //only a target on the same host is queued; the empty
                    //line closing it is the $req sent on the pass after
                    if (!$new_host || $host == $new_host)
                        $req1 = "GET ".$new_path." HTTP/1.1".$eol
                            ."Host: $host_header".$eol;
                }
                //Parse content-type header
                elseif (preg_match('/Content-Type: *(text\/[a-z]*)/i', $line, $regs))
                {
                    //compare in lowercase, the header was matched that way
                    $type = strtolower($regs[1]);
                    if ($type == "text/html")
                    {
                        $status = "HTML";
                    }
                    elseif ($type == "text/")
                    {
                        //no subtype: read up to three more lines
                        for ($boucle = 0; $boucle < 3; $boucle++)
                        {
                            fputs($fp,$req);
                            $answer = fgets($fp,4096);
                            $line = (string)$answer;
                            //test presence of <html> tag at the begining
                            if (preg_match('/<html/i', $line))
                            {
                                $status = "HTML";
                                break;
                            }
                        }
                    }
                    elseif ($type == "text/plain")
                    {
                        // extension txt or other ?
                        if (preg_match('/\.([a-z0-9]{1,4})$/i', $path, $extregs)
                            && !preg_match('/txt/i', $extregs[1]))
                            $status = "TEXT";
                        else
                            $status = "PLAINTEXT";
                    }
                    else
                    {
                        $status = "TEXT";
                    }
                }
                elseif (preg_match('/Last-Modified: *([a-z0-9,: ]+)/i', $line, $regs))
                {
                    //search last-modified header
                    $date = $regs[1];
                }
                //a line without any letter or digit ends the loop
                if (!preg_match('/[a-z0-9]/i', $line))
                    $answer = "";
            }
        }
        else
        {
            //errors 400 and 500
            $status = "NOFILE";
        }
        fclose($fp);
    }
    //returns variable or array
    if ($mode == 'date')
    {
        $return['status'] = $status;
        $return['lm_date'] = $date;
        return $return;
    }
    return $status;
}
//=================================================
//retrieve links from an url
//$url  : site root, compared with 'http://'.host.'/' for absolute links
//$path : directory of the page inside the site
//$file : file name of the page
//returns the list of links found in href and frame src attributes, each
//one as returned by url_purify() (empty array when there is none), or -1
//when the page could not be read or its robots meta tag forbids
//following links.
function explore($url,$path="",$file ="")
{
    $links = array();
    $location = $url.$path.$file;
    //nothing to read (an empty file name throws on PHP 8)
    if ($location == '')
        return -1;
    //tests the nofollow directive in robots tags
    $file_content = false;
    if ((test_robots_tags(@get_meta_tags($location)) & 5) == 0)
        $file_content = @file($location);
    if (!is_array($file_content))
        return -1;
    //group 5 is the host (with optional port), group 8 the path and query.
    //PCRE form of the former eregi() pattern; the scheme class was
    //written "[[a-z]" and also accepted a literal "[".
    $pattern = '~(<frame[^>]*src|href)[[:blank:]]*=[[:blank:]]*[\'"]?'
        .'((([a-z]{3,5}://)+(([.a-zA-Z0-9-])+(:[0-9]+)*))*'
        .'([%:/?=&;\\\\,._a-zA-Z0-9-]*))[#\'" ]?~i';
    foreach ($file_content as $eval)
    {
        //search hrefs and frames src; every match is removed from the
        //line, so a link repeated on one line is recorded once
        while (preg_match($pattern, $eval, $regs))
        {
            $eval = str_replace($regs[0],"",$eval);
            //test no host or same than site
            if ($regs[5] == "" || $url == 'http://'.$regs[5].'/')
            {
                if (substr($regs[8],0,1) == "/")
                    $link = url_purify($regs[8]);
                else
                    $link = url_purify($path.$regs[8]);
                //url_purify() answers -1 for links to skip
                if (is_array($link))
                    $links[] = $link;
            }
        }
    }
    return $links;
}
//=================================================
//test a link, search if is a file or dir, exclude robots.txt directives
//$link    : array with the keys 'url', 'path' and 'file'
//$exclude : exclusion array as returned by test_robots_txt()
//returns $link with 'ok' set to 1 (indexable) or 0; when the link turns
//out to be a directory, 'path' receives the file name plus a trailing
//slash and 'file' is emptied.
function detect_dir_html($link,$exclude='')
{
    $target = $link['url'].$link['path'].$link['file'];
    $test = test_url($target);
    //file
    if ($test == 'HTML' or $test == 'PLAINTEXT')
        $link['ok'] = 1;
    //dir
    elseif (test_url($target.'/') == "HTML")
    {
        //single trailing slash, whatever the parts ended with
        $link['path'] = rtrim($link['path'].$link['file'], '/').'/';
        $link['file'] = "";
        $link['ok'] = 1;
    }
    //none
    else
        $link['ok'] = 0;
    //test the exclude with robots.txt; the default '' holds no rule and
    //must not be indexed like an array
    if (is_array($exclude))
    {
        $all_excluded = (isset($exclude['@ALL@']) && $exclude['@ALL@'] == 1);
        if ($all_excluded or test_robots($exclude,$link['path']) == 1)
            $link['ok'] = 0;
    }
    return $link;
}
//=================================================
//search robots.txt for a site
//$site : site root URL; it must end with a slash
//returns the exclusion array of the "phpdig" user-agent when the file
//has one, else the one of "*": directory => 1 (no leading slash,
//trailing slash), or '@ALL@' => 1 when the whole site is disallowed.
//returns array('@NONE@' => 1) when there is no robots.txt or no rule
//applies to this robot.
//Rules containing wildcard or regex characters are not supported by the
//prefix comparison and are skipped.
function test_robots_txt($site)
{
    $none = array('@NONE@' => 1);
    if (test_url($site.'robots.txt') != 'PLAINTEXT')
        return $none;
    $robots = file($site.'robots.txt');
    if (!is_array($robots))
        return $none;
    $exclude = array();
    //rules met before any User-agent line are filed under ''
    $user_agent = '';
    foreach ($robots as $line)
    {
        if (preg_match('/^user-agent:[ ]*([a-z0-9*]+)/', strtolower($line), $regs))
            $user_agent = $regs[1];
        //take the whole path token; a narrower character class used to
        //stop at the first unusual character, so "Disallow: /~user/" was
        //read as "Disallow: /" and closed the entire site
        if (preg_match('/^disallow:[ ]*(\/[^[:space:]#]*)/i', $line, $regs))
        {
            $rule = $regs[1];
            if ($rule == '/')
                $exclude[$user_agent]['@ALL@'] = 1;
            elseif (!preg_match('/[*+?{}()\[\]|\\\\^$]/', $rule))
            {
                //stored without leading slash, with a trailing one
                $directory = substr($rule, 1);
                if (substr($directory, -1) != '/')
                    $directory .= '/';
                $exclude[$user_agent][$directory] = 1;
            }
        }
    }
    if (isset($exclude['phpdig']) && is_array($exclude['phpdig']))
        return $exclude['phpdig'];
    if (isset($exclude['*']) && is_array($exclude['*']))
        return $exclude['*'];
    //rules written for other robots do not concern this one
    return $none;
}
//=================================================
//tell whether a path falls under one of the excluded directories
//$exclude : array whose keys are directory prefixes
//$path    : directory path to test
//returns 1 when $path starts with one of the keys, 0 otherwise.
//Keys are compared literally as prefixes (no longer used as regular
//expressions), and every path is tested whatever characters it holds.
function test_robots($exclude,$path)
{
    if (!is_array($exclude))
        return 0;
    $path = (string)$path;
    foreach ($exclude as $path_exclude => $flag)
    {
        $prefix = (string)$path_exclude;
        //an empty prefix would match every path
        if (strlen($prefix) > 0 && strncmp($path, $prefix, strlen($prefix)) == 0)
            return 1;
    }
    return 0;
}
//=================================================
//read the robots directives out of a page's meta tags
//$tags : array name => content (anything else counts as "no tags")
//returns a bit field built from the first tag whose name contains
//"robots": 1 = nofollow, 2 = noindex, 4 = none; 0 when there is no such
//tag. Callers test "& 5" for nofollow and "& 6" for noindex.
function test_robots_tags($tags)
{
    if (!is_array($tags))
        return 0;
    foreach ($tags as $id => $content)
    {
        if (!preg_match('/robots/i', (string)$id))
            continue;
        $content = (string)$content;
        $directive = 0;
        if (preg_match('/nofollow/i', $content))
            $directive += 1;
        if (preg_match('/noindex/i', $content))
            $directive += 2;
        if (preg_match('/none/i', $content))
            $directive += 4;
        //only the first robots tag is taken into account
        return $directive;
    }
    return 0;
}
?>