Jump to content

huge amount of memory


arun4444

Recommended Posts

Hello Folks, i am pretty new to php programming, i have written a curl script to parse a web-page. It reads from a csv file and writes to one.

 

it consumes a huge amount of memory, what would be a solution for this.

 

heres the code:-

 

<?php

/*********************************
**Include Parser Class**
**********************************/

include('simple_html_dom.php');

/*********************************
**Set up cookie variables**
**********************************/
$cookiefile = tempnam("", "cookies");
// Create a temporary file to store cookies.

$login_url='https://www.dbldistributing.com/index.php?main_page=login';
/* The page that displays the login form. */

$login_post_url='https://www.dbldistributing.com/index.php?main_page=login&action=process';
/* The "action" value of the login form.  */

$username = "XXXXXX";
$password = "XXXXXX";

$agent="Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";

/*********************************************
**Load the "login" page and get some cookies**
*********************************************/

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL,$login_url);
/* The URL of the page to retrieve */

curl_setopt($ch, CURLOPT_USERAGENT, $agent);
/* Disguise self as a browser app. */

curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
/* Don't output the results -
return them as a string instead */

curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
/* Follow redirects.
This isn't actually necessary here  */

curl_setopt($ch, CURLOPT_COOKIEFILE, $cookiefile);
/* Read cookies from this file */

curl_setopt($ch, CURLOPT_COOKIEJAR, $cookiefile);
/* Save cookies to the same file too */

/* SSL stuff */
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST,  2);
/* Check the existence of a common name and also
verify that it matches the hostname provided.*/

/* SSL stuff */
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);

$result = curl_exec ($ch);
/* Perform the query, retrieve the page. */

curl_close ($ch);

$nothing = "";

/*************************************
Actually log in with the proper referer and cookies
**************************************/

/* The fields of the login form. */
$postfields = array(
'email_address'	=> $username,
'password' => $password,
//'rememberMe' => 'false',
'x' => $nothing,
'y' => $nothing,
);

$reffer = $login_url;
/* If the server checks the referer we need to spoof it */

$ch = curl_init();

curl_setopt($ch, CURLOPT_URL,$login_post_url);
curl_setopt($ch, CURLOPT_USERAGENT, $agent);
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS,
       http_build_query($postfields));
/* http_build_query() will properly escape the fields and
build a query string. */

curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
/* Follow redirects.*/
curl_setopt($ch, CURLOPT_REFERER, $reffer);
/* spoof the HTTP referer */

curl_setopt($ch, CURLOPT_COOKIEFILE, $cookiefile);
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookiefile);

/* Note that this is the same file as before */

curl_setopt($ch, CURLOPT_SSL_VERIFYHOST,  2);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);

$result = curl_exec ($ch);
/* Now we've got the contents of the page you see after
logging in saved in $result */

curl_close ($ch);

/********************************************
**load data from file and process each line**
********************************************/

$row = 1;
$handle = fopen("inputcsv.csv", "r");

$today = date("m.d.y.G.i.s");
$today .= "output.csv";

$fp = fopen($today, 'w');
while (($data = fgetcsv($handle, 1000, ",")) !== FALSE) {

    $num = count($data);
    $itemid = $data[0];
    $itemcode = $data[1];
    $itemurl = $data[2];

    //echo "$itemid $itemcode $itemurl";


/*****************************************
******Get Html data for each Page*********
******************************************/

$data_url='https://www.dbldistributing.com/index.php?main_page=product_info&products_id=';
$data_url .= trim($itemurl);
$reffer = $login_post_url;

$ch = curl_init();

curl_setopt($ch, CURLOPT_URL,$data_url);
curl_setopt($ch, CURLOPT_USERAGENT, $agent);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_REFERER, $reffer);
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookiefile);
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookiefile);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST,  2);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);

$result = curl_exec ($ch);
curl_close ($ch);

$html = str_get_html($result);

// find all font tags with attribite class=WHS001 - get quantity
foreach($html->find('font[class=WHS001]') as $e)
    $quantity = preg_replace("/[^0-9]/", '', $e->innertext); // ditch anything that is not a number

    $quantity = trim($quantity);

    // find all td tags with attribite class=prodprice02 - get price
    foreach($html->find('TD[class=prodprice02]') as $f)
    $priceee = preg_replace("/[^0-9.]/", '', $f->innertext); // ditch anything that is not a number

$priceee = trim($priceee);

$list = $itemid;
$list .= ",";
$list .= $itemcode;
$list .= ",";
$list .= $quantity;
$list .= ",";
$list .= $priceee;

fputcsv($fp, split(',', $list));
echo "row $row complete <br>";
$row++;
}

fclose($handle);
fclose($fp);

/**************************************************************
**All done. Kill the cookie file once it's not needed anymore**
**************************************************************/

unlink($cookiefile);

?>


Link to comment
https://forums.phpfreaks.com/topic/119906-huge-amount-of-memory/
Share on other sites

It appears inside the loop you are running a curl transaction.

If you have PHP5 installed. Have you looked at the multi-curl options?

 

I've had a lot of success handling many curl connections using them in parallel.

http://us.php.net/manual/en/function.curl-multi-init.php

 

instead of the foreach loop as well.. you can use simpleXML or DOM extensions to issue an xpath query on the html since it is ultimately xml. This could be passed like ...

 

$xml = new SimpleXMLElement($string);

$xml->xpath('//font[@class = WHS001]');

 

might be what you're already doing with the other library, but it works well for scraping large sites.

 

Archived

This topic is now archived and is closed to further replies.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.