<?php
        
/////////////////////////////////////////////////////////
        //
        //  AHNAR CSA3004 APT University of Malta
        //  Ian Bugeja 2005
        //
        //////////////////////////////////////////////////////////

        
require_once "PorterStemmer.php";

        class 
AdaptiveHyp
        
{
                
//cwords is an array with common english words (stop words)
                
private static $cwords = array('says''him''into''www''between''some''much''other''new''last''most''only''our''said''how'"don't"'get''what''which''them''why''over''would''see''the''and''you''her''she''that''about''links''link''post''was''for''were''not''has''also''its''been''are''with''his''they''one''have''this''from''had''but''can''out''all''your''more''will''none''here''could''their''there''when''since'"didn't"'these''who');
                
//clinks is an array with common links which should not be suggested
                
private static $clinks = array('ahnoo''img''email''e-mail''about us''privacy''home''index''back''sitemap''site map''favorites''bookmark''top''top of page''contents''guest book''forum''broken''rss feeds''bad link''register''options''contact us''disclaimer''newsletter''send''comment''post''search''print''comment''sign in''sign out''login''logout''search''video');
                private static 
$clinkspart = array('achname''applic''email''e-mail''subscribe''bookmark''forum''search''newsletter''homepage''home page''post''contact''download');
                
//ilinks is an array which contains important link text
                
private static $ilinks = array('read more''see also''read''more''full story''next''learn more');
                private static 
$cdom = array('.com''.org''.tv''.ws''.net''.edu''.mt''.uk');

                public static function 
common($wrd//returns true if a word is a popular english word
                
{
                        if (
strlen($wrd) <= 2) return true//2 or less words common
                        
if ($wrd[0] == '&') return true// eliminate &amp; &nbsp; etc...
                        
$wrd strtolower($wrd);

                        if (
in_array($wrdAdaptiveHyp::$cwords)) return true;

                        
$domi strrpos($wrd"."); //stristr($wrd, '.');
                        
if ($domi !== false)
                        {
                                
$dom substr($wrd$domistrlen($wrd)-$domi);
                                if (
in_array($domAdaptiveHyp::$cdom))
                                {
                                        if (
stripos($wrd$dom) >= count($wrd) - 4)
                                                return 
true//treat domain names as common
                                
}
                        }
                        return 
false;
                }

                public static function 
commonlink($link//common used words for links
                
{
                    
//if (in_array($link, AdaptiveHyp::$clinks)) return true;

                    
foreach(AdaptiveHyp::$clinks as $c)
                    {
                        if (
AdaptiveHyp::equal($link$c)) return true;
                    }

                    foreach(
AdaptiveHyp::$clinkspart as $c//clinkspart contains words which are not allowed
                    
{
                        if (
stripos($link$c) !== false) return true;
                    }

                    if (
trim($link) == "") return true//empty links

                    
return false;
                }

                public static function 
implink($link)   //returns true if links is an important link
                
{
                        foreach(
AdaptiveHyp::$ilinks as $il)
                        {
                                if (
AdaptiveHyp::equal($link$il))    ///????
                                
{
                                        return 
true;
                                }
                                
//elseif (stripos($link, $il) !== false)
                                //        return true;
                        
}

                        return 
false;
                }

                public static function 
equal($word1$word2//compares 2 words
                
{
                        
$word1 trim($word1".-_ ");
                        
$word2 trim($word2".-_ ");
                        if (
strncmp($word1$word23) != 0) return false;  //words must start with same characters
                        
$lev levenshtein($word1$word2);
                        if (
$lev == 0) return true;
                        if (
strpbrk($word1'()_[]{}') !== false) return false//solves problem if word contains brackets/underscore
                        
if (strpbrk($word2'()_[]{}') !== false) return false;
                        if (
is_numeric($word1)) return false//incase of numbers (can only be identical
                        
if (($lev <= 2) && (strlen($word1) > 4)) return true;
                        if (
$lev 4)
                        { 
//if less than 4 words r similar but force a furthur comparison to check how similar they are
                                
$p 3//start from 3 cause first 3 chars have already been verified above
                                
$min min(strlen($word1), strlen($word2));
                                
$max max(strlen($word1), strlen($word2));
                                if (
$max-$min 3) return false//not equal;
                                
if (($max >= 6) && (strncmp($word1$word24) != 0)) return false;

                                while ((
$word1[$p] == $word2[$p]) && ($p $min)) $p++;

                                if (
$p $min-2)
                                        return 
false//not equal    ????????????????????????????
                                
else
                                        return 
true;
                        }

                        return 
false//not equal
                
}

                public function 
popular($words$alreadycommon$maxa 8//computes the 8 most common words in the page
                
{
                        
$words strtolower($words);
                        
$words str_replace("."" "$words);
                        
$words str_replace(","" "$words);
                        
$words str_replace(":"" "$words);
                        
$words str_replace(";"" "$words);
                        
$words str_replace("'"" "$words);

                        
$splitarr explode(' '$words); //create array from string
                        
$countsplit array_count_values($splitarr);   //count words which are equal
                        
arsort($countsplit);
                        
reset($countsplit);
                        
$topten 0;
                        
$addedkeys 0;
                        
$count count($countsplit);
                        while (
$i $count)
                        {
                                
$kk key($countsplit);
                                
$kk trim($kk"',./\\!@#%^()-_+=[]{}|~`<>?:;\" ");  //characters to remove from words
                                
$val current($countsplit);

                                
$stem PorterStemmer::Stem($kk);
                                if (
strlen($stem) < 4$stem $kk;

                                if ((
$kk != ' ') && ($topten $maxa))
                                {
                                        if ((
AdaptiveHyp::common($kk) == false) && ($val 2))
                                        {
                                                
$keywords[] = array($kk$val111$stem);  //key, occurrance, updatedlast, age, updated times
                                                
$addedkeys++;
                                                
$topten++;
                                        }
                                }
                                elseif (
$kk != ' ')
                                { 
//if keyword was not in top 10 but is already present in keylist add too
                                        
if (isset($alreadycommon))
                                        foreach(
$alreadycommon as $al)
                                        {
                                                if (
$al[0] == $kk)
                                                {
                                                        
$keywords[] = array($kk$val111$stem);
                                                        
$addedkeys++;
                                                }
                                        }
                                }

                                
next($countsplit);
                                
$i++;
                        }

                        if (
$addedkeys == 0) unset($keywords);

                        return 
$keywords;
                }

                public function 
adaptKeyLinks($atagpos$atagcount$atagwords$keywords$contents)
                {
                        
$count 0;
                        
//suggest links that have similar text to keywords
                        
if (isset($atagwords))
                        foreach (
$atagwords as $k => $l)
                        {
                             if (
AdaptiveHyp::implink($l)) //Important links check if text before contains keywords
                             
{
                                
$pretext substr($contents$atagpos[$k]-2000,2000);
                                
$pretext strip_tags($pretext);
                                
$pretext strtolower($pretext);
                                
$total 0;
                                if (isset(
$keywords))
                                foreach(
$keywords as $kw)
                                        
$total += substr_count($pretext$kw[5]);   //?????????

                                
if ($total >= 5)
                                {
                                        
$restags[$k.""] = $atagpos[$k];
                                        
$count++;
                                }
                             }
                             else
                             {
                                if (isset(
$keywords))
                                foreach (
$keywords as $kw)
                                {
                                     if (
AdaptiveHyp::relativeImpKW($kw))
                                     if (
AdaptiveHyp::commonlink($l) == false//avoid common links
                                     
{
                                          
//if (stripos($atagwords[$k], $kw[5]) !== false) //keyword in or substring
                                          
if (AdaptiveHyp::wordpos($atagwords[$k], $kw[0]) == true)
                                          {
                                                  
$restags[$k.""] = $atagpos[$k];
                                                  
$count++;
                                          }
                                     }
                                }
                             }
                        }

                        if (
$count 1) unset($restags);
                        return 
$restags;
                }

                public function 
adaptLinks($atagpos$atagcount$contents)    //returns links that are adaptable
                
{
                        if (
$atagcount <= 5) return;
                        
//if <a> tag is in the middle of text
                        
for($i=0$i $atagcount$i++) //??? atagcount to check if equal to count($atagpos)
                        
{
                                
$x $atagpos[$i] - 20//20 chars before tag start
                                
$midtextflag 0;
                                while (
$x $atagpos[$i])
                                {
                                        if (
$contents[$x] == '>')
                                        {
                                                
$midtextflag++;
                                        }
                                        
$x++;
                                }

                                if (
$midtextflag 1//assume in middle of text
                                
{
                                        
$restags[$i.""] = $atagpos[$i]; //add to list of links to adapt
                                
}
                                else
                                {
                                        
$s substr($contents$atagpos[$i] - 2020);
                                        
//bold tag near
                                        
if ((stripos($s"bold")) || (stripos($s"strong")) || (stripos($s"<h1")) || (stripos($s"<h2")) || (stripos($s"<h3")) || (stripos($s"<b")))
                                                
$restags[$i.""] = $atagpos[$i]; //add to list of links to adapt
                                
}
                        }

                        return 
$restags;
                }

                public static function 
beforeAfterLink($atag$contents$keywords//checks if any keywords are near the keyword
                
{
                        
$before substr($contents$atag-4140);
                        
$enda stripos($contents'</a'$atag+1);  ///does not consider link text
                        
$after substr($contents$enda+440);
                        
$befaft $before .' '$after;

                        if (isset(
$keywords))
                        foreach(
$keywords as $kw)
                        {
                                if (
AdaptiveHyp::relativeImpKW($kw))
                                
//if (stripos($befaft, ' '.$kw[5]) !== false) //TRUE
                                
if (AdaptiveHyp::wordpos($befaft$kw[0]) == true)
                                {
                                        if (
stripos($before'</a') === false//if </a> tag before this tag do not suggest
                                                
return true;
                                }
                        }

                        
//if tag does not match see if there is bold text
                        
if (stripos($before"bold")) return true;
                        if (
stripos($before"strong")) return true;
                        if (
stripos($before"</h")) return true;
                        if (
stripos($before"<em")) return true;  //Emphasized
                        
if (stripos($before"<b>")) return true;  //bold

                        
return false;
                }

                public static function 
textLink($atext$keywords$imp 0.5)  //sees if any keyword is in atext (link)
                
{       //relevance of keyword....
                        
if (isset($keywords))
                        foreach(
$keywords as $kw)
                        {
                                
//if (AdaptiveHyp::equal($atext, $kw[5])) return true;
                                
if (AdaptiveHyp::relativeImpKW($kw$imp))
                                
//if (stripos($atext, ' '.$kw[5]) !== false) return true;  //keyword stem found in link
                                
if (AdaptiveHyp::wordpos($atext$kw[0]) == true) return true;
                        }

                        
//if (AdaptiveHyp::implink($atext)) return true;

                        
return false;
                }

                public static function 
linkList($atag$contents)
                {
                        
$before substr($contents$atag-4140);
                        
$enda stripos($contents'</a'$atag+1);  ///does not consider link text
                        
$after substr($contents$enda+440);
                        
$before $before .' '$after;

                        if (
stripos($before"</a") !== false) return true;
                        if (
stripos($before"<li") !== false) return true;
                        if (
stripos($before"<ul") !== false) return true;
                        if (
stripos($before"<ol") !== false) return true;
                        if (
stripos($before"<td") !== false) return true;
                        
//if (stripos($before, "<br") !== false) return true;

                        
return false;
                }

                public static function 
relativeImpKW($kw$imp 0.5)  //returns true if the keyword is of relevant importance
                
{
                        if (
$kw[3] < 3) return true;  //age < 3

                        
if ($kw[4] >= $kw[3]*$imp//updated times >= age*0.5
                                
return true;
                        else
                                return 
false;
                }

                public static function 
inPageLink($ataglist$atagtext$linkdest$contents$keywords)
                {
                        
$hashpos stripos($linkdest'#');
                        if (
$hashpos === false) return false//not in Page link (no #)

                        
$name substr($linkdest$hashpos+1strlen($linkdest)-$hashpos-1);
                        
$kpos 0;
                        foreach(
$atagtext as $ka => $kv)
                        {
                              if (
$kv == "achname-".$name)
                              {
                                
$kpos $ka;
                                break;
                              }
                        }

                        if (
$kpos != 0//if found
                        
{
                                
$pos $ataglist[$kpos];
                                
$subpart substr($contents$pos2000);

                                
$tot 0;
                                foreach(
$keywords as $keyw)
                                {
                                        if (
AdaptiveHyp::relativeImpKW($keyw))
                                        {
                                                
$tot += substr_count($subpart$keyw[0]); //count keyword occurance
                                        
}
                                }

                                if (
$tot >= count($keyw) * 1.5)
                                {
                                        return 
true;
                                }
                        }

                        return 
false;
                }


                public static function 
wordpos($str$wrd)
                {
                        
$sw explode(" "$str);
                        foreach(
$sw as $s)
                        {
                                if (
AdaptiveHyp::equal($s$wrd) == true)
                                {
                                        return 
true;
                                }
                        }

                        return 
false;
                }

        }

?>