<?php
    
/////////////////////////////////////////////////////////
    //
    //  AHNAR CSA3004 APT University of Malta
    //  Ian Bugeja 2005
    //
    //////////////////////////////////////////////////////////
    
// Address of the AHNAR proxy script; a target URL is appended directly after
// the trailing "ahnarurl=" query parameter.
define("AHNARLOC", "http://ahnar.iannet.org/ahnar.php?ahnarurl=");

    class 
HTML_Parser
    
{

        public 
$stshlink ''//<link stylesheet tag
        
private $domain '';  //page domain
        
public $conframes false;  //if page is a frames page
        
public $atagcount 0;      //number of <a> tags
        
public $words '';         //page contents excluding tags
        
public $ataglist;           //position of <a> tags
        
public $pagetitle;          //page title
        
public $middleatext;        //contains text in middle of a tag
        
public $docutype;           //!DOCTYPE tag of html page
        
public $bodytag;            //<body> tag
        
private $bold false;
        public 
$adest;              //<a> tags destination


        
public function __construct($stlink$dom)
        {
                
$this->stshlink $stlink;
                
$siz strlen($dom);
                if (
$dom[$siz-1] == '/'$dom substr($dom0$siz-1);
                
$this->domain $dom;
                
$this->atagcount 0;
                
$this->words '';
                
$this->pagetitle '';
                
$this->docutype '';
                
$this->bodytag '';
                
$this->bold false;
                
$this->ataglist = array();
                
$this->middleatext = array();
        }


        private function 
id_tag($tag)    //identify tag (removing < and >)
        
{
                
$len strlen($tag);
                
$tag[0] = chr(32);
                
$tag[$len-1] = chr(32);
                
$tag str_replace("\r"" "$tag);
                
$tag str_replace("\n"" "$tag);
                
$tag trim($tag);

                
$tag strtolower($tag);
                
$pos strpos($tag' ');

                if (
$pos === false)
                {
                        return 
$tag;
                }
                else
                {
                        return 
substr($tag0$pos);
                }
        }

        public function 
html_parse($code)
        {
            
$newcontents '';
            
$subcontents '';
            
$boldcontents '';
            
$pos = -1;
            
$len strlen($code);
            
$scriptparse false;
            
$titleparse false;
            
$midaflag 0;
            
$midatext '';
            
$aname '';
            while (
$pos $len)
            {
                
$pos++;
                
//if (($code{$pos} == '<') && (($scriptparse == false) || (($code[$pos+2] == 's') || ($code[$pos+2] == 'S'))))
                
if ($code[$pos] == '<')
                {
                    
$tpos $pos;
                    while ((
$code[$tpos] != '>') && ($tpos $len)) //find end tag marker
                    
{
                        
$tpos++;
                        if (
$code[$tpos] == '<') { $tpos--; break; } //in case of incorrect closed tag...
                    
}
                    
$tag substr($code$pos$tpos-$pos+1); //get tag
                    //identify tag type
                    
$idtag $this->id_tag($tag);
                    if (
$idtag[0] == '/')  //closing tag
                    
{
                        
$subcontents .= ' ';
                        if (
$this->bold == true)
                        {
                                
$subcontents .= ' '.$boldcontents ' ' $boldcontents;
                                
$this->bold false;
                        }
                    }
                    else
                    {
                        
$bt $this->checkBoldCSS($tag);
                        if (
$bt !== false$this->bold $bt;
                        if (
$this->bold == true$boldcontents '';
                    }

                    switch (
$idtag)
                    {
                        case 
'!doctype' $this->docutype $tag; break;
                        case 
'link'$this->stshlink .= $this->modify_src($tag'href'false); break;
                        case 
'html': break;
                        case 
'/html': break;
                        case 
'title':  if ($this->pagetitle == ""$titleparse true$newcontents .= "<!--"; break;
                        case 
'/title'$titleparse false$newcontents .= "-->"; break;
                        case 
'head' : break;
                        case 
'/head' : break;
                        case 
'meta' $newcontents .= "<!--ahnar rem-meta-->";
                                        if (
stripos($tag"refresh") !== false//not to remove meta refresh tag
                                        
{
                                                
$metaext1 stripos($tag"url"); echo $metaext1.'~';
                                                
$metaext2 stripos($tag"content");  echo $metaext2.'~';
                                                
$metaurl substr($tag$metaext2+8$metaext1+4-($metaext2+8)); echo $metaurl;
                                                
$tag str_replace($metaurl""$tag);
                                                
$tag $this->modify_src($tag"content"true);
                                                
$tag str_ireplace("content=""content=".$metaurl$tag);
                                                
$newcontents .= $tag;
                                        }
                                        break;
                        case 
'/meta' : break;
                        case 
'body' :   //$tag = $this->remAttrib($tag, 'onLoad');
                                        
if ($this->getValue($tag'background') != ""$tag $this->modify_src($tag'background'false);
                                      
$this->bodytag $tag;
                                      break;
                        case 
'/body' : break;
                        case 
'frame' :  $this->conframes true;  //mark page as a Framed page (parsed differently)
                                        
$tag $this->modify_src($tag'src'true);
                                        
$newcontents .= $tag; break;
                        case 
'iframe' $tag $this->modify_src($tag'src'true);
                                        
$newcontents .= $tag; break;
                        case 
'a' : if (count($this->ataglist) != count($this->middleatext))
                                   {
                                        echo 
count($this->ataglist).'|'.count($this->middleatext)."ERRORRRRRRRRRRRRRRR\r\n";
                                        while (
count($this->ataglist) > count($this->middleatext))
                                              
$this->middleatext[] = " ";
                                        while (
count($this->ataglist) < count($this->middleatext))
                                              
$this->ataglist[] = 0;
                                   }
                                   
//if (count($this->ataglist) != count($this->middleatext)) echo "ERRORO BIGG\r\n";
                                   
$this->atagcount++;
                                   
$dest $this->getValue($tag"href");
                                   
$this->adest[] = $dest;
                                   
$tag $this->modify_src($tag'href'true);
                                   
$tag $this->modify_src($tag'src'false);
                                   
$midaflag 0;
                                   
$aname $this->getValue($tag"name");
                                   if(
$aname != "")
                                   {        
//echo $aname." ";
                                        
$midaflag 3//name atag (anchor position)
                                   
}
                                   elseif (
$dest == "")
                                   {
                                        
$midaflag 7;
                                   }
                                   elseif (
stripos($tag'mailto'1) === false)  //NOT EMAIL
                                   
{      //echo '|'.$dest.'|';
                                        
if ($this->idDestType($dest) < 50)
                                        {
                                                
$midaflag 1$midatext '';  //normal atag
                                        
}
                                        else
                                                
$midaflag 4;
                                   }
                                   else
                                        
$midaflag 2//email
                                   
$this->ataglist[] = strlen($newcontents); //position of 'a' tag to know where abouts...
                                   
$newcontents .= $tag;  //echo $tag.$dest.$midaflag.'|';
                                   
break;

                        case 
'/a' :     $newcontents .= $tag;
                                        switch (
$midaflag)
                                        {
                                                case 
$this->middleatext[] = trim(str_replace("&nbsp;"" "strtolower($midatext))); break;
                                                case 
$this->middleatext[] = 'email';
                                                case 
$this->middleatext[] = 'achname-'.$aname$aname ''; break;
                                                case 
$this->middleatext[] = 'applic'; break;
                                                case 
$this->middleatext[] = 'img'; break;
                                                default : 
$this->middleatext[] = 'ahnoo'; break;
                                        }
                                        
$midaflag 0;
                                        break;

                        case 
'img' :    $tag $this->modify_src($tag'src'false);
                                        
$newcontents .= $tag;  //echo $tag;
                                        
$subcontents .= " ".$this->getValue($tag"alt"); //add alt to text (used for keyword count)
                                        
if ($midaflag 0$midaflag 5//notify image
                                        
break;
                        case 
'area' :   $tag $this->modify_src($tag'href'true);
                                        
$newcontents .= $tag;
                                        break;
                        case 
'script' $scriptparse true;
                                        
$tag $this->modify_src($tag'src'false);
                                        
$newcontents .= $tag;

                                        
$pos $tpos+1;
                                        while (
true)
                                        {
                                                
$tpos++;
                                                if ((
$code[$tpos] == '<') && ($code[$tpos+1] == '/') && (($code[$tpos+2] == 's') || ($code[$tpos+2] == 'S')) && (($code[$tpos+3] == 'c') || ($code[$tpos+3] == 'C')))
                                                        break;
                                        }
                                        
$tpos--;
                                        
$tag substr($code$pos$tpos-$pos+1);
                                        
$newcontents .= $tag;

                                        break;
                        case 
'/script' $scriptparse false;
                                        
$newcontents .= $tag; break;
                        case 
'style' :  $scriptparse true;
                                        
$newcontents .= $tag; break;
                        case 
'/style' $scriptparse false;
                                        
$newcontents .= $tag; break;
                        case 
'table' : if ($this->getValue($tag'background') != ""$tag $this->modify_src($tag'background'false);
                                       
$newcontents .= $tag; break;
                        case 
'td' : if ($this->getValue($tag'background') != ""$tag $this->modify_src($tag'background'false);
                                    
$newcontents .= $tag; break;
                        case 
'droplet' $tag $this->modify_src($tag'src'true);
                                         
$newcontents .= $tag; break;
                        case 
'form' $tag $this->modify_src($tag'action'true);
                                      
$ax strpos($tag"ahnarurl="0);
                                      
$ax2 strpos($tag"\""$ax); //add a hidden field to send url to script
                                      
if ($ax2 <= 0$ax2 strpos($tag" "$ax); if ($ax2 <= 0$ax2 strpos($tag">"$ax);
                                      
$actionsrc substr($tag,$ax+9$ax2-$ax-9);
                                      
$newcontents .= $tag; break;
                        case 
'/form' $newcontents .= "<input type='hidden' name='ahnarurl' value='$actionsrc'></form>"; break;

                        case 
'b' :
                        case 
'em' :
                        case 
'strong' :
                        case 
'h1' $this->bold true;
                                    
$newcontents .= $tag;
                                    
$boldcontents ''; break;

                        case 
'param' :  $paramvalue strtolower($this->getValue($tag"name"));
                                        if (
$paramvalue == "movie")
                                                
$tag $this->modify_src($tag'value'false);
                                       
$newcontents .= $tag; break;
                        case 
'embed' $tag $this->modify_src($tag'src'false);
                                        
$newcontents .= $tag; break;
                        case 
'applet' : if (stripos($tag'codebase') === false)
                                                
$tag str_ireplace("<applet ""<applet codebase=\".\" "$tag);
                                        
$tag $this->modify_src($tag'codebase'false);
                                        
$newcontents .= $tag; break;
                        
//Tags not above just add unmodified
                        
default : $newcontents .= $tag; break;
                    }
                    
////
                    
$pos $tpos;
                }
                else
                {
                    
$newcontents .= $code[$pos];    //text not consisting of a tag (body text)

                    
if (ord($code[$pos]) > 30)
                    if (
$scriptparse == false$subcontents .= $code[$pos]; //subcontents (used for page keywords)

                    
if ($titleparse$this->pagetitle .= $code[$pos];  //page title

                    
if ($midaflag == 1$midatext .= $code[$pos]; //text between <a> tag ie actual link

                    
if ($this->bold == true$boldcontents .= $code[$pos]; //Bold text
                
}
            }
              
//echo $subcontents;
            
$subcontents str_replace("&"" &"$subcontents);
            
$subcontents str_replace(";""; "$subcontents);
            
$subcontents str_replace("&nbsp;"" "$subcontents);
            
$this->words str_replace("  "" "$subcontents);
            return 
$newcontents;
        }


        private function 
modify_src($imgtag$attrib$ahnr)
        {
                
$pos 1;
                
$imgtag str_replace("\r"" "$imgtag); $imgtag str_replace("\n"" "$imgtag); //remove new line
                
$len strlen($imgtag);
                
$newtag $imgtag;
                while ((
$pos !== false) && ($pos $len))
                {
                        
$pos stripos($imgtag' '.$attrib$pos+1);
                        if (
$pos !== false)
                        {
                                
$qpos strpos($imgtag'='$pos+1);
                                
$i $qpos;
                                do { 
$i++; } while (($imgtag[$i] == chr(32)) || ($imgtag[$i] == chr(34)) || ($imgtag[$i] == chr(39)) );

                                if (
$attrib == 'href')
                                {
                                        if (
stripos($imgtag'mailto'$i) !== false) return $imgtag//case of mailto href
                                        
if (stripos($imgtag'javascript:'$i) !== false) return $imgtag//case of javascript href
                                
}

                                
$httppos stripos($imgtag'http://'$i);   if ($httppos-$i 5$httppos=0;
                                
$httpspos stripos($imgtag'https://'$i);  if ($httpspos-$i 5$httpspos=0;

                                if (
$imgtag[$i] == '#')
                                        
$newtag $imgtag//instead of return
                                
elseif (($httppos 0) || ($httpspos 0))
                                {
                                        
$newtag ''//TO CHECK--------------
                                        
$newtag substr($imgtag0$i);
                                        if (
$ahnr == true$newtag .= AHNARLOC;
                                        
$newtag .= substr($imgtag$istrlen($imgtag)-$i+1);
                                        
$imgtag $newtag//instead of return
                                
}
                                else
                                {
                                        
$newtag '';
                                        
$newtag substr($imgtag0$i);
                                        if (
$ahnr == true$newtag .= AHNARLOC;
                                        if ((
$imgtag[$i] != '/') && ($imgtag[$i] != '.'))
                                                
$newtag .= $this->domain;
                                        elseif (
$imgtag[$i] == '/')
                                        {
                                                
$slpos stripos($this->domain'/'9);
                                                if (
$slpos === false$newtag .= $this->domain; else $newtag .= substr($this->domain0$slpos);
                                        }
                                        elseif (
$imgtag[$i] == '.')    ///??? CHECK
                                        
{
                                                if (
$imgtag[$i+1] == '.')
                                                        
$slpos strripos($this->domain'/');
                                                else
                                                        
$slpos 0//just for next line
                                                
if ($slpos <= 6$newtag .= $this->domain; else $newtag .= substr($this->domain0$slpos);
                                        }
                                        while (
$imgtag[$i] == '.'$i++;
                                        if (
$imgtag[$i] != '/'$newtag .= '/';
                                        
$newtag .= substr($imgtag$istrlen($imgtag)-$i+1);
                                        
$imgtag $newtag//instead of return
                                
}
                        }
                        
$len strlen($imgtag);
                }

                return 
$newtag;
        }

        private function 
checkBoldCSS($tag)  //returns true if style attribute present and bold is set
        
{
                
$stpos stripos($tag"style");
                if (
$stpos === false) return;

                if (
stripos($tag"bold"$stpos 5)) return true;

                return 
false;
        }


        private function 
getValue($atag$attrib)  //returns destination of (href) <a> tag
        
{
                
$hrefpos stripos($atag$attrib);
                if (
$hrefpos === false) return "";

                
$epos stripos($atag"="$hrefpos);
                if ((
$epos === false) || ($epos $hrefpos+strlen($attrib)+3)) return "";
                
$qpos stripos($atag'"'$epos);
                if ((
$qpos !== false) && ($qpos $epos+3))
                        
$fqpos stripos($atag'"'$qpos+1); //closing "
                
else
                {
                        
$qpos stripos($atag"'"$epos);

                        if ((
$qpos !== false) && ($qpos $epos+3))
                                
$fqpos stripos($atag"'"$qpos+1); //closing '
                        
else
                        {
                                
$qpos $epos;
                                
$fqpos stripos($atag" "$qpos);
                                if (
$fqpos === false$fqpos strlen($atag)-1;
                        }
                }

                if (
$qpos+>= $fqpos) return "";

                return 
trim(substr($atag$qpos+1$fqpos-$qpos-1));
        }

        private function 
remAttrib($tag$attrib)
        {
                
//identical to above
                
$attribpos stripos($tag$attrib);
                if (
$attribpos === false) return $tag;

                
$epos stripos($tag"="$attribpos);
                if ((
$epos === false) || ($epos $attribpos+strlen($attrib)+3)) return "";
                
$qpos stripos($tag'"'$epos);
                if ((
$qpos !== false) && ($qpos $epos+3))
                        
$fqpos stripos($tag'"'$qpos+1); //closing "
                
else
                {
                        
$qpos stripos($tag"'"$epos);

                        if ((
$qpos !== false) && ($qpos $epos+3))
                                
$fqpos stripos($tag"'"$qpos+1); //closing '
                        
else
                        {
                                
$qpos $epos;
                                
$fqpos stripos($tag" "$qpos);
                                if (
$fqpos === false$fqpos strlen($tag)-1;
                        }
                }

                
//addon
                
$newtag substr($tag0$attribpos);
                
$newtag .= substr($tag$fqpos+1strlen($tag)-$fqpos-1);

                return 
$newtag;
        }

        private function 
idDestType($path)
        {
                
$extpos strrpos($path".");
                if (
$extpos === false) return 0//not file

                
$ext substr($path$extpos4);
                if (
$ext == ".htm") return 1;
                if (
$ext == ".pdf") return 3;
                if ((
$ext == ".jpg") || ($ext == ".gif") || ($ext == ".bmp") || ($ext == ".png") || ($ext == ".jpe"))
                        return 
4;

                if (
$ext == ".zip") return 55;
                if (
$ext == ".rar") return 56;
                if (
$ext == ".exe") return 57;

                return 
1//default
        
}

    } 
//class

?>