<?php
/* Dyn2Html : Transform your dynamic web site into a simple static HTML one
 *     version 0.9, 2006-10-05
 *     Created by Alter Systems SARL, http://www.altersystems.fr
 *     Distributed under GPL licence
 *     How To : modify $LIVE_SITE & $OUTPUT_DIR for your own use
 */

	// Base URL of the dynamic site to mirror. Paths starting with "/" are
	// appended to it as-is, so it is written without a trailing slash.
	$LIVE_SITE="http://www.altersystems.eu";
	// Prefix prepended to every generated file name when the static pages are written.
	$OUTPUT_DIR="output/";
	// Matches <a href="...">...</a> anchors whose first attribute is href;
	// capture group 1 is the href value.
	$REGEX_AHREF="#<a\s+href\=\"([^\"]*)\"[^>]*>.*?</a>#si";
	// Parallel search/replace lists used by getGoodURL() to turn a URL into a
	// file name: each entry of $LINK_REPLACE_BAD is replaced by the entry at the
	// same position in $LINK_REPLACE_GOOD, in list order (so the named entities
	// are handled before the bare "&").
	// NOTE(review): the eight empty strings in $LINK_REPLACE_BAD pair with plain
	// letters in $LINK_REPLACE_GOOD - presumably accented characters that were
	// lost when this file was re-encoded; TODO confirm against the original.
	$LINK_REPLACE_BAD= array("/","?","=","","","","","","","","","&quot;","&aacute;","&acirc;","&eacute;","&egrave;","&ecirc;","&icirc;","&ocirc;","&ucirc;","&ccedil;","&",":");
	$LINK_REPLACE_GOOD=array("", "" ,"" ,"e","e","e","a","a","i","u","o",""      ,"a"       ,"a"      ,"e"       ,"e"       ,"e"      ,"i"      ,"o"      ,"u"      ,"c"       ,"-","-"); 
	// Content types accepted by isAcceptedContentType(); crawl() reports and
	// skips links served with any other type.
	$CONTENT_TYPES=array("text/html");


/**
 * Tells whether the resource at $url is served with an accepted Content-Type.
 *
 * Sends a header-only request (CURLOPT_NOBODY) and compares the media type
 * found in the Content-Type response header with the $CONTENT_TYPES list.
 * The comparison ignores letter case and any parameter following the media
 * type (such as "; charset=UTF-8").
 *
 * @param string $url absolute URL to probe
 * @return bool true when the media type is listed in $CONTENT_TYPES; false
 *              otherwise, including when the request fails or the response
 *              carries no Content-Type header
 */
function isAcceptedContentType($url)
{
	global $CONTENT_TYPES;
	$ch=curl_init($url);
	curl_setopt($ch,CURLOPT_HEADER,true);
	curl_setopt($ch,CURLOPT_NOBODY,true);
	curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);
	$headers=curl_exec($ch);
	curl_close($ch);
	if ($headers===false)
	{
		// the transfer failed: there is no header to inspect
		return false;
	}
	// Header names are case-insensitive and the value may or may not carry
	// parameters, so capture only the media type: everything after the colon
	// up to the first ";" or whitespace (which includes the end of the line).
	if (!preg_match('/^Content-Type:[ \t]*([^;\s]+)/mi',$headers,$match))
	{
		return false;
	}
	$cType=strtolower($match[1]);
	return in_array($cType,array_map('strtolower',$CONTENT_TYPES));
}

	
/**
 * Downloads the resource at the given URL with cURL.
 *
 * @param string $url absolute URL to fetch
 * @return mixed the value given back by curl_exec() with
 *               CURLOPT_RETURNTRANSFER switched on
 */
function getContent($url)
{
	$handle = curl_init($url);
	// ask cURL to hand the body back instead of printing it
	curl_setopt($handle, CURLOPT_RETURNTRANSFER, 1);
	$body = curl_exec($handle);
	curl_close($handle);

	return $body;
}

/**
 * Recursively discovers the internal pages reachable from $url.
 *
 * Fetches $LIVE_SITE.$url, extracts every href captured by $REGEX_AHREF and
 * keeps the links that point inside the live site: absolute links starting
 * with $LIVE_SITE, query-only links ("?...") and root-relative links ("/...").
 * Each link not seen before is probed with isAcceptedContentType(): accepted
 * ones receive a unique static file name and are crawled in turn, the others
 * are recorded in $badLinks so they are not probed again.
 *
 * @param string $url      path, relative to $LIVE_SITE, of the page to scan
 * @param array  $links    in/out map: href as written in the pages => static file name
 * @param array  $badLinks in/out map whose keys are the hrefs rejected for their Content-Type
 * @return void
 */
function crawl($url,&$links,&$badLinks)
{
	global $REGEX_AHREF, $LIVE_SITE;
	$siteLength=strlen($LIVE_SITE);
	$content=getContent($LIVE_SITE.$url);
	preg_match_all($REGEX_AHREF,$content,$out,PREG_PATTERN_ORDER);
	foreach ($out[1] as $value)
	{
		// the href exactly as written in the page: it is the key later used
		// to rewrite the links inside the generated files
		$initialValue=$value;
		$isAbsolute=(strlen($value)>$siteLength && strtolower(substr($value,0,$siteLength))==strtolower($LIVE_SITE));
		$firstChar=substr($value,0,1);
		// "//host/path" is a protocol-relative link to another host, not a
		// path on the live site
		$isRootRelative=($firstChar=="/" && substr($value,0,2)!="//");
		if (!$isAbsolute && $firstChar!="?" && !$isRootRelative)
		{
			// external or unsupported link: leave it untouched
			continue;
		}
		if (array_key_exists($initialValue,$links) || array_key_exists($initialValue,$badLinks))
		{
			// already classified during this crawl
			continue;
		}
		if ($isAbsolute)
		{
			// strip off the live site from the link
			$value=substr($value,$siteLength);
		}
		if (substr($value,0,1)=="?")
		{
			$value="/".$value;
		}
		// hrefs are HTML-encoded inside the page (e.g. "&amp;" for "&"); the
		// HTTP request needs the decoded URL
		$target=html_entity_decode($value);
		if (isAcceptedContentType($LIVE_SITE.$target))
		{
			// the file name is still derived from the raw href so that
			// generated names do not depend on the decoding step
			$generatedURL=getGoodURL($value);
			$goodURL=$generatedURL.".html";
			$i=0;
			// append a counter until the name is not used by another link
			while (in_array($goodURL,$links))
			{
				$i++;
				$goodURL=$generatedURL.$i.".html";
			}
			// register before recursing so that pages linking back here
			// do not trigger another crawl of the same href
			$links[$initialValue]=$goodURL;
			crawl($target,$links,$badLinks);
		}
		else
		{
			$badLinks[$initialValue]="";
			// the link comes from crawled markup: escape it before printing
			echo htmlspecialchars($target)." does not have a good Content-Type<br/>";
		}
	}
}

/**
 * Turns a site URL into a base file name (without extension).
 *
 * The URL is url-decoded, then every entry of $LINK_REPLACE_BAD is replaced
 * by the entry at the same position in $LINK_REPLACE_GOOD.
 *
 * @param string $badURL URL or path to convert
 * @return string the converted name, or "index" when nothing is left after
 *                the replacements
 */
function getGoodURL($badURL)
{
	global $LINK_REPLACE_BAD, $LINK_REPLACE_GOOD;
	$url=str_replace($LINK_REPLACE_BAD,$LINK_REPLACE_GOOD,urldecode($badURL));
	// Compare with the empty string explicitly: empty() also reports "0" as
	// empty, which would rename a page reduced to "0" into "index".
	if ($url==="")
	{
		$url="index";
	}
	return $url;
}

/**
 * Writes one static HTML file per entry of $links.
 *
 * For each href => file name pair, the rewritten page content is obtained
 * from getGoodContent() and saved as $OUTPUT_DIR followed by the file name.
 * The script stops with die() as soon as a file cannot be created or written.
 *
 * @param array $links map href => static file name
 * @return void
 */
function outputStatic($links)
{
	global $OUTPUT_DIR;

	foreach ($links as $sourceLink => $staticName)
	{
		$pageHtml = getGoodContent($sourceLink, $links);
		$fileName = $OUTPUT_DIR.$staticName;

		$handle = fopen($fileName, "w");
		if (!$handle)
		{
			die("cannot create file $fileName");
		}
		if (fwrite($handle, $pageHtml) === FALSE)
		{
			die ("cannot write to file $fileName");
		}
		fclose($handle);
	}
}

/**
 * Downloads a crawled page and rewrites its known links to their static names.
 *
 * $url is a key of $links, that is an href exactly as written in the crawled
 * pages: it can be an absolute URL starting with $LIVE_SITE, a query-only
 * link ("?...") or a root-relative path, and it may contain HTML entities.
 * It is therefore normalised to a decoded path under $LIVE_SITE before being
 * fetched; prepending $LIVE_SITE to an absolute href would request a
 * malformed URL.
 *
 * @param string $url   href of the page to download
 * @param array  $links map href => static file name, handed to replaceLinks()
 * @return mixed the value returned by replaceLinks() for the downloaded content
 */
function getGoodContent($url,$links)
{
	global $LIVE_SITE;
	$siteLength=strlen($LIVE_SITE);
	if (strtolower(substr($url,0,$siteLength))==strtolower($LIVE_SITE))
	{
		// absolute internal link: keep only the part after the site prefix
		$url=(string)substr($url,$siteLength);
	}
	if (substr($url,0,1)=="?")
	{
		$url="/".$url;
	}
	// hrefs are HTML-encoded in the page source ("&amp;"); request the decoded URL
	$content=getContent($LIVE_SITE.html_entity_decode($url));
	return replaceLinks($content,$links);
}

/**
 * Rewrites attribute values that match a known link.
 *
 * Every occurrence of ="<href>" in $content is replaced by ="<file name>",
 * the pairs being applied in the iteration order of $links.
 *
 * @param mixed $content text to transform
 * @param array $links   map href => static file name
 * @return mixed $content itself when $links is empty, otherwise the result
 *               of the replacements
 */
function replaceLinks($content, $links)
{
	$needles = array();
	$substitutes = array();
	foreach ($links as $from => $to)
	{
		$needles[] = "=\"$from\"";
		$substitutes[] = "=\"$to\"";
	}

	if (count($needles) == 0)
	{
		// nothing to replace: hand the input back untouched
		return $content;
	}

	// with array arguments str_replace applies the pairs one after the other
	return str_replace($needles, $substitutes, $content);
}


// Seed the href => static file name map with the home page.
$links=array("/"=>"index.html");
// Filled by crawl() with the hrefs rejected for their Content-Type.
$badLinks=array();
echo "<br>************** crawling *******************<br>";
// Walk the site from its root; both arrays are passed by reference and
// filled in place.
crawl("/",$links,$badLinks);
echo "<br>************** crawling done *******************<br>";
echo count($links)." unique valid links found<br/>";
// Dump the complete href => file name map.
print_r($links);
echo "<br>************** output *******************<br>";
// Download every page again and write its link-rewritten copy to disk.
outputStatic($links);
echo "<br>************** output done *******************<br>";

?>	
	 