Jump to content

kangkingkong

New Members
  • Posts

    5
  • Joined

  • Last visited

kangkingkong's Achievements

Newbie

Newbie (1/5)

0

Reputation

  1. Ah yes, I just found some code that was tabbed off the edge of my screen... got it working now, thanks!
  2. Here's the code: $file_string = file_get_contents('http://www.emeraldinsight.com/search.htm?st1='.$searchterm.'&ec=1&bf=1&ct=jnl&nolog=503574&displayno=30&page='.$currentpagenumber.''); preg_match_all('#<span class=\"header\">(.*?)</span>#s', $file_string, $titles); if( count($titles[1]) > 0) { for($i = 0; $i < count($titles[1]); $i++) { echo "my count".$i." contains : ".$titles[1][$i]."<br>"; } }
  3. Hi, thank you for the suggestion. Now it does return the value I previously failed to get. However, this value now fails to be returned: <span class="header">The provision of European information by public libraries in the UK</span> I'm confused.
  4. Hi, I have a problem getting the value in this string. The regex I'm using is: preg_match_all('#<span class=\"header\">(.*)</span>#', $file_string, $titles); and it works fine when I search <span class="header">A Radiological Assessment of Scottish Edible Seaweed Consumption</span> but when I search: <span class="header">A Radiological Assessment of Scottish Edible Seaweed Consumption</span> it returns null. As far as I can see, it's the newline in the middle of the string that causes the regex to skip it. Any help, please? Thank you.
  5. Hi guys, Im trying to make a webscrapper and im having a huge problem when retrieving huge amount of data. I have tried to increase the memory through PHP.ini but its still doesnt solve the problem. The webscrapper I want to make is to retrieve data from database journal and put it into an excel file. While it is working with small datasets, it will run out of memory when retrieving large datasets. Here is the code : x <?php function fetchRawData($url,$search,$currentpagenumber,$numpage,$numrecordtotalsofar) { if($currentpagenumber<$numpage) { //initialise curl $url = "http://ieeexplore.ieee.org/search/searchresult.jsp?queryText%3D".$search."&rowsPerPage=100&pageNumber=".$currentpagenumber."&resultAction=ROWS_PER_PAGE"; echo "<br>new url = ".$url."<br>"; $ch = curl_init(); curl_setopt($ch,CURLOPT_URL,$url); curl_setopt($ch,CURLOPT_RETURNTRANSFER,true); curl_setopt($ch,CURLOPT_FAILONERROR,true); curl_setopt($ch,CURLOPT_FOLLOWLOCATION,true); // curl_setopt($ch,CURLOPT_TIMEOUT,50000); curl_setopt($ch,156,500000000); curl_setopt($ch,155,500000000); curl_setopt($ch,CURLOPT_SSL_VERIFYPEER,false); curl_setopt($ch,CURLOPT_SSL_VERIFYHOST,false); $data=curl_exec($ch); if(!$data) { var_dump(curl_getinfo($ch)); die(); } //parsing data $parsedData = array(); phpQuery::newDocumentHTML($data); $arrtitle = array(); $posttitle=1; if($currentpagenumber<2) { $numrecordsofar=1; } else { $numrecordsofar=$numrecordtotalsofar; } //get the title, author and year of publication foreach(pq("a") as $link) { $title = pq($link)->text(); if($title) { //use regular expression to get the relevant information if (preg_match("*articleDetails.jsp*", pq($link)->attr('href'))&&$gettitle<1) { if(!(preg_match("*View full abstract*", $title))) { $dummyvar=$numrecordsofar+$posttitle; array_push($arrtitle,$title); $countrecord++; $gettitle=1; } } } } //get the number of data foreach(pq("span") as $link) { $title = pq($link)->text(); if($title) { if (preg_match("*display-status results-returned*", 
pq($link)->attr('class'))) { $countnumberonly = preg_replace("*Results returned*", "", $title); $totalpageint = intval($countnumberonly); //calculate how many pages needed and record the current page $totalpageint = intval($totalpageint / 100)+2; } } } //initialise write to excel $objPHPExcel = new PHPExcel(); $objPHPExcel->getProperties()->setCreator("Maarten Balliauw") ->setLastModifiedBy("Maarten Balliauw") ->setTitle("PHPExcel Test Document") ->setSubject("PHPExcel Test Document") ->setDescription("Test document for PHPExcel, generated using PHP classes.") ->setKeywords("office PHPExcel php") ->setCategory("Test result file"); // Set active sheet index to the first sheet, so Excel opens this as the first sheet $objPHPExcel = PHPExcel_IOFactory::load("IEEE_Scrap.xlsx"); $objPHPExcel->setActiveSheetIndex(0); $objPHPExcel->createSheet(); $row = $objPHPExcel->getActiveSheet()->getHighestRow()+1; //get data from arrays for($j=0;$j<count($arrtitle);$j++) { if(isset($arrtitle[$j])) { $dummyvar=$numrecordsofar+$j; $objPHPExcel->getActiveSheet()->SetCellValue('A'.$dummyvar,$arrtitle[$j]); } else { $dummyvar=$numrecordsofar+$j; $globalIEEE[$tempcount+$j][0]="No Data"; $objPHPExcel->getActiveSheet()->SetCellValue('A'.$dummyvar,"No Data"); } } $objWriter = new PHPExcel_Writer_Excel2007($objPHPExcel); $objWriter->save('IEEE_Scrap.xlsx'); //close curl and phpexcel curl_close($ch); unset($ch); unset($objPHPExcel); unset($objWriter); $currentpagenumber++; $numrecordtotalsofar=$numrecordtotalsofar+$countrecord; set_time_limit(0); sleep(5); $rawHTML = fetchRawData($url,$search,$currentpagenumber,$totalpageint,$numrecordtotalsofar); return $data; } } ?> The logic is first I retrieve the data on a page then putting it into an array after parsing it then initalise phpexcel to write the data from the array into excel then unset cURL and phpexcel and then move on to next page. Sorry the code is a bit messy as I have tried so many modifications but still cant get it work. 
Please help me!
×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.