MrWhufc Posted December 5, 2011 Share Posted December 5, 2011 Hi guys, I'm working on a side-project whilst my work at Uni is at a low but I'm extremely rusty on PHP; I've not touched it since around '07. Before I get on to my project, which will lift information from external websites, I'm just trying to acquaint myself with cURL again. Before I get flamed, yes this code is largely lifted from a website, but I pretty much understand it. My aim is to be able to get the title of a website, namely Facebook. Whilst this code is fine on seemingly any other website, it doesn't seem to be able to work for Facebook. I've checked the source code, and I really can't see the problem. Can anyone shed some light for me? <?php $url = 'http://www.facebook.com/'; $webinfo = get_data($url); $title = get_match('/<title>(.*)<\/title>',$webinfo); $content= '<h2>Title of webpage: </h2><p>'.$name.'</p>'; echo $content; function get_match($regex,$content) { preg_match($regex,$content,$matches); return $matches[1]; } //gets the data from a URL function get_data($url) { $ch = curl_init(); $timeout = 5; curl_setopt($ch,CURLOPT_URL,$url); curl_setopt($ch,CURLOPT_RETURNTRANSFER,1); curl_setopt($ch,CURLOPT_CONNECTTIMEOUT,$timeout); $data = curl_exec($ch); curl_close($ch); return $data; } ?> Thanks!!! Quote Link to comment https://forums.phpfreaks.com/topic/252517-big-problem/ Share on other sites More sharing options...
QuickOldCar Posted December 5, 2011 Share Posted December 5, 2011 It's not as easy as appears to get the titles from all websites. Some chunks of the code I use for my index. <?php $url = trim("facebook.com"); //parse the url function getparsedHost($new_parse_url) { $parsedUrl = parse_url(trim($new_parse_url)); return trim($parsedUrl['host'] ? $parsedUrl['host'] : array_shift(explode('/', $parsedUrl['path'], 2))); } /*connect to the url using curl to see if exists and get the information*/ //$cookie = tempnam('tmp','cookie'); //$cookie_file_path = "tmp/"; $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $url); //curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie); //curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file_path); curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3'); curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1); curl_setopt($ch, CURLOPT_TIMEOUT, 15); curl_setopt($ch, CURLOPT_MAXREDIRS, 15); curl_setopt($ch, CURLOPT_HEADER, 1); curl_setopt ($ch, CURLOPT_RETURNTRANSFER, 1); curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt ($ch, CURLOPT_FILETIME, 1); curl_setopt ($ch, CURLOPT_FOLLOWLOCATION, 1); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE); curl_setopt($ch, CURLOPT_ENCODING , ""); $curl_session = curl_init(); //curl_setopt($curl_session, CURLOPT_COOKIEJAR, $cookie); //curl_setopt($curl_session, CURLOPT_COOKIEFILE, $cookie_file_path); curl_setopt($curl_session, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3'); curl_setopt($curl_session, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1); curl_setopt($curl_session, CURLOPT_ENCODING , ""); curl_setopt($curl_session, CURLOPT_TIMEOUT, 15); curl_setopt($curl_session, CURLOPT_HEADER, 1); curl_setopt($curl_session, CURLOPT_SSL_VERIFYPEER, FALSE); curl_setopt($curl_session, CURLOPT_HEADER, true); curl_setopt($curl_session, CURLOPT_MAXREDIRS, 15); curl_setopt($curl_session, CURLOPT_RETURNTRANSFER, true); curl_setopt( $curl_session, CURLOPT_AUTOREFERER, true ); curl_setopt ($curl_session, CURLOPT_HTTPGET, true); curl_setopt($curl_session, CURLOPT_URL, $url); $string = mysql_real_escape_string(curl_exec($curl_session)); $html = mysql_real_escape_string(curl_exec ($ch)); $info = curl_getinfo($ch); /*curl response check and to resolve url to the actual location*/ $response = curl_getinfo( $ch ); if ($response['http_code'] == 301 || $response['http_code'] == 302) { ini_set("user_agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3"); $headers = get_headers($response['url']); $location = ""; foreach( $headers as $value ) { if ( substr( strtolower($value), 0, 9 ) == "location:" ) return get_final_url( trim( substr( $value, 9, strlen($value) ) ) ); } } if ( preg_match("/window\.location\.replace\('(.*)'\)/i", $con_addtent, $value) || preg_match("/window\.location\=[\"'](.*)[\"']/i", $con_addtent, $value) || preg_match("/location\.href\=[\"'](.*)[\"']/i", $con_addtent, $value) ) { $finalurl = get_final_url($value[1]); } else { $finalurl = $response['url']; } $html = curl_exec($ch); $header = "Location: "; if (!$html) { ?> <br /><FONT COLOR=red>No url inserted:</b></FONT> <br /><B><FONT COLOR=orange>Please try another url, that website may not exist. The url may or may not require the www.</b></FONT><br /> <?php exit; } if (curl_errno($ch)) { ?> <B><FONT COLOR=orange> <?php curl_error($ch); ?> </b></FONT><br /> <?php } else { ?> <br /> <?php $errmsg = curl_error($ch); curl_close($ch); $valid = array(200, 201, 202, 203, 204, 205, 206, 207, 300, 301, 302, 303, 304, 305, 306, 307); if (in_array($info['http_code'], $valid)) { ?> <B><FONT COLOR=lime>Connection OK</b></FONT> <?php } $invalid = array(400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 500, 501, 502, 503, 504, 505, 506, 507, 510); if (in_array($info['http_code'], $invalid)) { ?> <B><FONT COLOR=red>Connection Error</b></FONT> <?php } ?> <br /> <?php $redirected = array(300, 301, 302, 303, 307); if (in_array($info['http_code'], $redirected)) { ?> <B><FONT COLOR=orange>Redirection</b></FONT> <?php } $redirectedno = array(200, 201, 202, 203, 204, 205, 206, 207); if (in_array($info['http_code'], $redirectedno)) { ?> <B><FONT COLOR=lime> Direct Connection</b></FONT> <?php } print <<<END <br /> END; /*grabs the title description and keywords - last effort grabs any information could from head or header*/ $ok=1; //meta $title = null; if(preg_match('/<title>([^>]*)<\/title>/smiU', $html, $match)){ if (isset($match) && is_array($match) && count($match) > 0) { $title = strip_tags(trim($match[1])); } } elseif(preg_match("/<title>(.*)<\/title>/smUi",$html, $match)){ if (isset($match) && is_array($match) && count($match) > 0) { $title = strip_tags(trim($match[1])); } } elseif(preg_match("/<title>(.+?)<\/title>/msi",$html, $match)){ if (isset($match) && is_array($match) && count($match) > 0) { $title = strip_tags(trim($match[1])); } } else { preg_match('/<title>([^<]*)<\/title>/smiU', $html, $match); if (isset($match) && is_array($match) && count($match) > 0) { $title = strip_tags(trim($match[1])); } } $title = preg_replace('~\s{2,}~', ' ', $title); $charset = 'None'; $description=''; $keywords=''; preg_match("/<head.*>(.*)<\/head>/smUi",$html, $headers); if(count($headers) > 0) { if(preg_match("/<meta[^>]*http-equiv[^>]*charset=(.*)(\"|')>/Ui",$headers[1], $results)){ $charset= $results[1]; } else { $charset='None'; } } else { $ok=0; //echo 'No HEAD - Might be malformed or be a feed<br />'; } if($charset != 'None'){ $title=iconv($charset, "UTF-8", $title); } if($title == null){ $title = $finalurl; } $parsed_url = getparsedHost($finalurl); echo "Url: <a href='$finalurl'>$parsed_url</a><br />Title: $title"; } ?> If just need a title of the page there are other simpler methods to do this. But curl has more options and able to do more with it. Quote Link to comment https://forums.phpfreaks.com/topic/252517-big-problem/#findComment-1294678 Share on other sites More sharing options...
ManiacDan Posted December 5, 2011 Share Posted December 5, 2011 Your regex is malformed and will never work on any website, I don't know how it's working for sites other than facebook. It should end in a slash. You also may not be getting a well-formed document from facebook, or their title tag may be on multiple lines. Quote Link to comment https://forums.phpfreaks.com/topic/252517-big-problem/#findComment-1294689 Share on other sites More sharing options...
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.