wee493 Posted December 30, 2009 Share Posted December 30, 2009 I'm collecting data from the twitter streaming api. I'm getting about 1,400 tweets a minute to parse through. I'm collecting the username, profile image location, and tweet. With the tweet I'm going through and searching through every word for a link. This is happening in two separate processes. 1) is collecting the data into a sql table 2) is parsing through the data putting it into its correct tables. My problem is that the script that parses through the data seems to be limited at 40 a minute. It will only parse through 40 tweets a minute at best. I have tried running two instances of the script at once and I get 80 a minute. I have tried running it on another host and got 40 a minute. Is there a setting I need to change? I'm never going to parse this data with 1,400 coming in and only 40 of them processed.... Here is the base of my process script if it helps anyone understand what I'm trying to say. $get_data = mysql_query("SELECT id, tweets FROM raw_data WHERE processed = '0' ORDER BY `raw_data`.`id` ASC"); while($r = mysql_fetch_array($get_data)){ $iidd = $r['id']; $tweet = json_decode($r['tweets']); $update = mysql_query("UPDATE raw_data SET processed = '1' WHERE id = '$iidd'"); if ($tweet->user->screen_name != '' && $tweet->text != ''){ $date = date("U"); $user = mysql_real_escape_string($tweet->user->screen_name); $twt = mysql_real_escape_string($tweet->text); $prof_img = $tweet->user->profile_image_url; $link = explode(' ', $tweet->text); foreach ($link as $li){ $li = str_replace(')', '', $li); $li = str_replace('(', '', $li); $pos = substr($li, 0, 4); if( $pos == 'http' ){ $link = $li; if (substr($link, 7, 6) == 'bit.ly'){ // if bit.ly link $long_url = bitly_convert($link); } } else { $link = 'not found'; } } Quote Link to comment https://forums.phpfreaks.com/topic/186722-php-sql-problems-being-limited-at-40/ Share on other sites More sharing options...
premiso Posted December 30, 2009 Share Posted December 30, 2009 The query is slowing you down. Store any processed id in an array and do the update separately. $ids = array(); while($r = mysql_fetch_array($get_data)){ $iidd = $ids[] = $r['id']; $tweet = json_decode($r['tweets']); //$update = mysql_query("UPDATE raw_data SET processed = '1' WHERE id = '$iidd'"); // your other code here. } // end while mysql_query("UPDATE raw_data SET processed = '1' WHERE id IN(" . implode(',', $ids) . ")") or trigger_error("Query Failed: " . mysql_error()); Now given that, it seems you omitted a good chunk of code, which I have my suspicions does an insert query. If you would like this tweaked for that as well post the rest of that code. But if you optimize the queries / take them out of the loop you will be able to process a ton more than just 1400 a minute. Quote Link to comment https://forums.phpfreaks.com/topic/186722-php-sql-problems-being-limited-at-40/#findComment-986077 Share on other sites More sharing options...
wee493 Posted December 30, 2009 Author Share Posted December 30, 2009 The query is slowing you down. Store any processed id in an array and do the update separate. $ids = array(); while($r = mysql_fetch_array($get_data)){ $iidd = $ids[] = $r['id']; $tweet = json_decode($r['tweets']); //$update = mysql_query("UPDATE raw_data SET processed = '1' WHERE id = '$iidd'"); // your other code here. } // end while mysql_query("UPDATE raw_data SET processed = '1' WHERE id IN(" . implode(',', $ids) . ")") or trigger_error("Query Failed: " . mysql_error()); Now given that, it seems you omitted a good chunk of code, which I have my suspicions does an insert query. If you would like this tweaked for that as well post the rest of that code. But you optimize the querys / take them out of the loop you will be able to process a ton more than just 1400 a minute. Actually later in the script I delete the processed id's, so next time the script runs it will basically be starting from the top, not having to go through all the id's until it finds one that's not processed. I have removed that from the script and am now posting the rest of the code. 
$get_data = mysql_query("SELECT id, tweets FROM raw_data WHERE processed = '0' ORDER BY `raw_data`.`id` ASC"); while($r = mysql_fetch_array($get_data)){ $id = $r['id']; $tweet = json_decode($r['tweets']); if ($tweet->user->screen_name != '' && $tweet->text != ''){ $date = date("U"); $user = mysql_real_escape_string($tweet->user->screen_name); $twt = mysql_real_escape_string($tweet->text); $prof_img = $tweet->user->profile_image_url; $link = explode(' ', $tweet->text); foreach ($link as $li){ $li = str_replace(')', '', $li); $li = str_replace('(', '', $li); $pos = substr($li, 0, 4); if( $pos == 'http' ){ $link = $li; if (substr($link, 7, 6) == 'bit.ly'){ // if bit.ly link $long_url = bitly_convert($link); } } else { $link = 'not found'; } } $delete_row = mysql_query("DELETE FROM raw_data WHERE id = '$id'"); /*echo 'Username: '.$tweet->user->screen_name.'<br>'; echo 'Tweet: '.$tweet->text.'<br>'; echo 'Date: '.$date.'<br>'; echo 'Link: '.$link.'<br>'; echo '<img src="'.$tweet->user->profile_image_url.'">'; echo '<hr>'; */ // insert data into db if ($link != 'not found'){ $sql = mysql_query("SELECT id FROM tweets WHERE username = '$user' and tweet = '$twt' LIMIT 1"); if (mysql_num_rows($sql) == '0'){ // Check if user exists in db // $usr_chk = mysql_query("SELECT id FROM users WHERE username = '$user'"); if (mysql_num_rows($usr_chk) == '0'){ // Create new user $sql = mysql_query("INSERT INTO users (username, profile_img) VALUES ('$user', '$prof_img')"); $u_id = mysql_result(mysql_query("SELECT id FROM users WHERE username = '$user'"), 0); } else { $u_id = @mysql_result(mysql_query("SELECT id FROM users WHERE username = '$user'"), 0); $sql = mysql_query("UPDATE users SET profile_img = '$prof_img' WHERE id = '$u_id'"); } // Record link // $sql = mysql_query("SELECT * FROM link WHERE url = '$link' LIMIT 1"); if (mysql_num_rows($sql) == '0'){ // Create new entry $sql = mysql_query("INSERT INTO link (url, count, long_url, first_seen, last_seen) VALUES ('$link', '1', 
'$long_url', '$date', '$date')"); $link_id = @mysql_result(mysql_query("SELECT id FROM link WHERE url = '$link'"), 0); } else { // Update old Entry $count = @mysql_result(mysql_query("SELECT count FROM link WHERE url = '$link'"), 0); $count = $count + 1; $link_id = @mysql_result(mysql_query("SELECT id FROM link WHERE url = '$link'"), 0); $sql = mysql_query("UPDATE link SET count = '$count', last_seen = '$date' WHERE url = '$link'"); } // Insert Tweet // $sql = mysql_query("INSERT INTO tweets (tweet, date, link_id, username) VALUES ('$twt', '$date', '$link_id', '$user')"); $tweet_id = @mysql_result(mysql_query("SELECT id FROM tweets WHERE link_id = '$link_id' AND date = '$date' AND username = '$user' AND tweet = '$twt'"), 0); // Create relationship $sql = mysql_query("INSERT INTO relationships (username, tweet_id, link_id) VALUES ('$user', '$tweet_id', '$link_id')"); } } Quote Link to comment https://forums.phpfreaks.com/topic/186722-php-sql-problems-being-limited-at-40/#findComment-986084 Share on other sites More sharing options...
premiso Posted December 30, 2009 Share Posted December 30, 2009 Alright, this should be a bit quicker than your old method and perhaps will process more data. I am not sure, as there are still 2 queries that have to be looped due to their nature (I am still trying to think of a way around this). If you have any questions about it let me know. I added the trigger_error for the error reports, used a syntax in MySQL called ON DUPLICATE KEY UPDATE which will allow for 1 less query again and used mysql_insert_id instead of running another query to grab that data. Hopefully it works, unsure as I do not have the system to test it, so pending any syntax / SQL issues it should work. <?php $get_data = mysql_query("SELECT id, tweets FROM raw_data WHERE processed = '0' ORDER BY `raw_data`.`id` ASC") or trigger_error('RawDataQueryFailed: ' . mysql_error()); $ids = array(); $users = array(); $insertTweet = array(); $x=0; while($r = mysql_fetch_array($get_data)){ $id = $ids[] = $r['id']; $tweet = json_decode($r['tweets']); if ($tweet->user->screen_name != '' && $tweet->text != ''){ $date = date("U"); $user = $users[] = mysql_real_escape_string($tweet->user->screen_name); $twt = mysql_real_escape_string($tweet->text); $prof_img = $tweet->user->profile_image_url; $link = explode(' ', $tweet->text); foreach ($link as $li){ $li = str_replace(')', '', $li); $li = str_replace('(', '', $li); $pos = substr($li, 0, 4); if( $pos == 'http' ){ $link = $li; if (substr($link, 7, 6) == 'bit.ly'){ // if bit.ly link $long_url = bitly_convert($link); } }else { $link = 'not found'; } } // insert data into db if ($link != 'not found'){ // Record link // $insertTweet[$user] = array('link' => $link, 'long_url' => $long_url, 'user' => $user, 'twt' => $twt, 'prof_img' => $prof_img); $userQuery[] = "('$user', '$prof_img')"; } } } mysql_query("INSERT INTO users (`username`, `profile_img`) VALUES " . implode(', ', $userQuery) . 
" ON DUPLICATE KEY UPDATE `profile_img` = VALUES(`profile_img`)") or trigger_error("UserQuery Failed: " . mysql_error()); $relations = array(); foreach ($insertTweet as $user => $tweetData) { $link_id = $tweet_id = 0; $sql = mysql_query("INSERT INTO link (url, count, long_url, first_seen, last_seen) VALUES ('{$tweetData['link']}', '1', '{$tweetData['long_url'}', '$date', '$date') ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), count = count+1, last_seen = VALUES(last_seen)") or trigger_error("LinkQuery Failed: " . mysql_error()); if ($sql) { $link_id = mysql_insert_id(); $sql = mysql_query("INSERT INTO tweets (tweet, date, link_id, username) VALUES ('{$tweetData['twt']}', '$date', '$link_id', '$user') ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id)") or trigger_error("TweetQuery Failed: " . mysql_error()); if ($sql) { $tweet_id = mysql_insert_id(); $relations[] = "('{$tweetData[$user]}', '$tweet_id', '$link_id')"; } }else { continue; } } mysql_query("INSERT INTO relationships (username, tweet_id, link_id) VALUES " . implode(", ", $relations) . " ON DUPLICATE KEY UPDATE tweet_id = tweet_id") or trigger_error("RelationQuery Failed: " . mysql_error()); mysql_query("DELETE FROM raw_data WHERE id IN(" . implode(',', $ids) . ")") or trigger_error("DeleteQuery Failed: " . mysql_error()); ?> Quote Link to comment https://forums.phpfreaks.com/topic/186722-php-sql-problems-being-limited-at-40/#findComment-986124 Share on other sites More sharing options...
wee493 Posted December 30, 2009 Author Share Posted December 30, 2009 Alright, this should be a bit quicker then your old method and perhaps will process more data. I am not sure, as there are still 2 queries that have to be looped due to their nature (I am still trying to think of a way around this). If you have any questions about it let me know. I added the trigger_error for the error reports, used a syntax in MySQL called ON DUPLICATE KEY UPDATE which will allow for 1 less query again and used mysql_insert_id instead of running another query to grab that data. Hopefully it works, unsure as I do not have the system to test it, so pending any syntaxual / SQL issues it should work. <?php $get_data = mysql_query("SELECT id, tweets FROM raw_data WHERE processed = '0' ORDER BY `raw_data`.`id` ASC") or trigger_error('RawDataQueryFailed: ' . mysql_error()); $ids = array(); $users = array(); $insertTweet = array(); $x=0; while($r = mysql_fetch_array($get_data)){ $id = $ids[] = $r['id']; $tweet = json_decode($r['tweets']); if ($tweet->user->screen_name != '' && $tweet->text != ''){ $date = date("U"); $user = $users[] = mysql_real_escape_string($tweet->user->screen_name); $twt = mysql_real_escape_string($tweet->text); $prof_img = $tweet->user->profile_image_url; $link = explode(' ', $tweet->text); foreach ($link as $li){ $li = str_replace(')', '', $li); $li = str_replace('(', '', $li); $pos = substr($li, 0, 4); if( $pos == 'http' ){ $link = $li; if (substr($link, 7, 6) == 'bit.ly'){ // if bit.ly link $long_url = bitly_convert($link); } }else { $link = 'not found'; } } // insert data into db if ($link != 'not found'){ // Record link // $insertTweet[$user] = array('link' => $link, 'long_url' => $long_url, 'user' => $user, 'twt' => $twt, 'prof_img' => $prof_img); $userQuery[] = "('$user', '$prof_img')"; } } } mysql_query("INSERT INTO users (`username`, `profile_img`) VALUES " . implode(', ', $userQuery) . 
" ON DUPLICATE KEY UPDATE `profile_img` = VALUES(`profile_img`)") or trigger_error("UserQuery Failed: " . mysql_error()); $relations = array(); foreach ($insertTweet as $user => $tweetData) { $link_id = $tweet_id = 0; $sql = mysql_query("INSERT INTO link (url, count, long_url, first_seen, last_seen) VALUES ('{$tweetData['link']}', '1', '{$tweetData['long_url'}', '$date', '$date') ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), count = count+1, last_seen = VALUES(last_seen)") or trigger_error("LinkQuery Failed: " . mysql_error()); if ($sql) { $link_id = mysql_insert_id(); $sql = mysql_query("INSERT INTO tweets (tweet, date, link_id, username) VALUES ('{$tweetData['twt']}', '$date', '$link_id', '$user') ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id)") or trigger_error("TweetQuery Failed: " . mysql_error()); if ($sql) { $tweet_id = mysql_insert_id(); $relations[] = "('{$tweetData[$user]}', '$tweet_id', '$link_id')"; } }else { continue; } } mysql_query("INSERT INTO relationships (username, tweet_id, link_id) VALUES " . implode(", ", $relations) . " ON DUPLICATE KEY UPDATE tweet_id = tweet_id") or trigger_error("RelationQuery Failed: " . mysql_error()); mysql_query("DELETE FROM raw_data WHERE id IN(" . implode(',', $ids) . ")") or trigger_error("DeleteQuery Failed: " . mysql_error()); ?> Thank you sooooo much! I did a tiny bit of modifying, but it works and now I can process ~1,500 a minute! 
The only typo with the code is here Change this $sql = mysql_query("INSERT INTO link (url, count, long_url, first_seen, last_seen) VALUES ('{$tweetData['link']}', '1', '{$tweetData['long_url'}', '$date', '$date') ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), count = count+1, last_seen = VALUES(last_seen)") To This $sql = mysql_query("INSERT INTO link (url, count, long_url, first_seen, last_seen) VALUES ('{$tweetData['link']}', '1', '{$tweetData['long_url']}', '$date', '$date') ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), count = count+1, last_seen = VALUES(last_seen)") the '{$tweetData['long_url']}', was missing the closing ] Quote Link to comment https://forums.phpfreaks.com/topic/186722-php-sql-problems-being-limited-at-40/#findComment-986151 Share on other sites More sharing options...
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.