wee493 · Posted March 19, 2010

Can someone please help me optimize these queries? I'm processing mass amounts of data and currently I'm going through about 450 of these a second. I would like to get this higher if possible, as data is coming in faster than it is being processed. Thanks!

$get_data = mysql_query("SELECT * FROM raw_data WHERE processed = '0' ORDER BY `raw_data`.`id` ASC")
    or trigger_error('RawDataQueryFailed: ' . mysql_error());

while ($data = mysql_fetch_array($get_data)) { // continuously fetch data
    $row_id = $data['id'];
    $tweet  = json_decode($data['tweets']);

    if ($tweet->user->screen_name != '' && $tweet->text != '') {
        $date     = date("U");
        $user     = mysql_real_escape_string($tweet->user->screen_name);
        $twt      = mysql_real_escape_string($tweet->text);
        $prof_img = $tweet->user->profile_image_url;

        echo 'Username: '.$tweet->user->screen_name.'<br>';
        echo 'Tweet: '.$tweet->text.'<br>';
        echo 'Date: '.$date.'<br>';
        echo 'Link: '.$link.'<br>';
        echo '<img src="'.$tweet->user->profile_image_url.'">';
        echo '<hr>';

        // insert data into db
        if ($link != 'not found') {
            // Check if user exists in db //
            $usr_chk = mysql_query("SELECT id FROM users WHERE username = '$user'");
            if (mysql_num_rows($usr_chk) == '0') {
                // Create new user
                $sql  = mysql_query("INSERT INTO users (username, profile_img) VALUES ('$user', '$prof_img')");
                $u_id = mysql_insert_id();
            } else {
                $u_id = @mysql_result(mysql_query("SELECT id FROM users WHERE username = '$user'"), 0);
                $sql  = mysql_query("UPDATE users SET profile_img = '$prof_img' WHERE id = '$u_id'");
            }

            // Record link //
            $sql = mysql_query("SELECT * FROM link WHERE url = '$link' LIMIT 1");
            if (mysql_num_rows($sql) == '0') {
                // Create new entry
                $sql = mysql_query("INSERT INTO link (url, count, long_url, first_seen, last_seen) VALUES ('$link', '1', '$long_url', '$date', '$date')");
                $link_id = mysql_insert_id();
            } else {
                // Update old entry
                $link_id = @mysql_result(mysql_query("SELECT id FROM link WHERE url = '$link'"), 0);
                $sql     = mysql_query("UPDATE link SET count = (count + 1), last_seen = '$date' WHERE url = '$link'");
            }

            // Insert tweet //
            $sql = mysql_query("INSERT INTO tweets (tweet, date, link_id, username) VALUES ('$twt', '$date', '$link_id', '$user')");
            $tweet_id = mysql_insert_id();

            // Create relationship
            $sql = mysql_query("INSERT INTO relationships (username, tweet_id, link_id) VALUES ('$user', '$tweet_id', '$link_id')");
        }
    }

    mysql_query("DELETE FROM raw_data WHERE id = '$row_id'");
} // end while
GKWelding · Posted March 19, 2010

Try this, you should see a small improvement in the rate. Using mysql_result is slower than using mysql_fetch_row, mysql_fetch_array and mysql_fetch_assoc. Also, avoid SELECT * if you can get away with it. If this helps, can you post back here with the increase in rate, as I'm very interested.

$get_data = mysql_query('SELECT id, tweets FROM raw_data WHERE processed = \'0\' ORDER BY `raw_data`.`id` ASC')
    or trigger_error('RawDataQueryFailed: ' . mysql_error());

while ($data = mysql_fetch_array($get_data)) { // continuously fetch data
    $row_id = $data['id'];
    $tweet  = json_decode($data['tweets']);

    if ($tweet->user->screen_name != '' && $tweet->text != '') {
        $date     = date('U');
        $user     = mysql_real_escape_string($tweet->user->screen_name);
        $twt      = mysql_real_escape_string($tweet->text);
        $prof_img = $tweet->user->profile_image_url;

        echo 'Username: '.$tweet->user->screen_name.'<br>';
        echo 'Tweet: '.$tweet->text.'<br>';
        echo 'Date: '.$date.'<br>';
        echo 'Link: '.$link.'<br>';
        echo '<img src="'.$tweet->user->profile_image_url.'">';
        echo '<hr>';

        // insert data into db
        if ($link != 'not found') {
            // Check if user exists in db //
            $usr_chk = mysql_query('SELECT id FROM users WHERE username = '.$user);
            if (mysql_num_rows($usr_chk) == '0') {
                // Create new user
                $sql  = mysql_query('INSERT INTO users (username, profile_img) VALUES ('.$user.', '.$prof_img.')');
                $u_id = mysql_insert_id();
            } else {
                $result = mysql_query('SELECT id FROM users WHERE username = '.$user.' LIMIT 1');
                if ($result) {
                    $u_id = mysql_fetch_row($result);
                    $u_id = $u_id['id'];
                    $sql  = mysql_query('UPDATE users SET profile_img = '.$prof_img.' WHERE id = '.$u_id);
                }
            }

            // Record link //
            $sql = mysql_query('SELECT id FROM link WHERE url = '.$link.' LIMIT 1');
            if (mysql_num_rows($sql) == '0') {
                // Create new entry
                $sql = mysql_query('INSERT INTO link (url, count, long_url, first_seen, last_seen) VALUES ('.$link.', \'1\', '.$long_url.', '.$date.', '.$date.')');
                $link_id = mysql_insert_id();
            } else {
                // Update old entry
                $result = mysql_query('SELECT id FROM link WHERE url = '.$link.' LIMIT 1');
                if ($result) {
                    $link_id = mysql_fetch_row($result);
                    $link_id = $link_id['id'];
                    $sql     = mysql_query('UPDATE link SET count = (count + 1), last_seen = '.$date.' WHERE url = '.$link);
                }
            }

            // Insert tweet //
            $sql = mysql_query('INSERT INTO tweets (tweet, date, link_id, username) VALUES ('.$twt.', '.$date.', '.$link_id.', '.$user.')');
            $tweet_id = mysql_insert_id();

            // Create relationship
            $sql = mysql_query('INSERT INTO relationships (username, tweet_id, link_id) VALUES ('.$user.', '.$tweet_id.', '.$link_id.')');
        }
    }

    mysql_query('DELETE FROM raw_data WHERE id = '.$row_id.' LIMIT 1');
} // end while
wee493 · Posted March 19, 2010 · Author

Surprisingly, it actually seems to be going about 50 rows a minute slower. I'm also looking into why it's not inserting into the links row anymore.

What I'm doing is saving the JSON data of tweets (from Twitter) into a database, then using a separate process to decode them and insert them into readable rows. I'm receiving almost 1,500 tweets a minute and only decoding around ~350-400 a minute.

How much do you think it would help if I stored the JSON data (receiving 1,500 tweets a minute) in a separate database on a separate server? There would still be a majority of the stress on one database, but the second database would just be handling insert queries.
PFMaBiSmAd · Posted March 19, 2010

Edit: The reason the code just posted is slower is because it actually executes more queries, not fewer. Actually, your code was already executing the extra query that I saw.

There are several things you can do that would speed up the code:

1) Change mysql_fetch_array() to mysql_fetch_assoc().

2) Don't use $date = date('U'); in your PHP code. Use one of the MySQL functions directly in your query to get the date/time in the format you want. If you are truly using a Unix timestamp, see the MySQL UNIX_TIMESTAMP() function.

3) Don't echo data to the screen, or echo a minimum of it.

4) The // Check if user exists in db // block of code can be replaced by a single INSERT ... ON DUPLICATE KEY UPDATE query. The username column should be a unique index.

5) The // Record link // block of code can also be replaced by a single INSERT ... ON DUPLICATE KEY UPDATE query. The url column should be a unique index.

6) Remove any @ in your code. They actually cause extra code to be executed even if there are no errors, and you should simply have display_errors set to OFF anyway, so there is no need to put any @ in any code.

7) Is your raw_data id column a primary key?

Here is the INSERT syntax showing the ON DUPLICATE KEY UPDATE syntax:

INSERT [LOW_PRIORITY | DELAYED | HIGH_PRIORITY] [IGNORE]
    [INTO] tbl_name [(col_name,...)]
    {VALUES | VALUE} ({expr | DEFAULT},...),(...),...
    [ON DUPLICATE KEY UPDATE col_name=expr [, col_name=expr] ...]
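[Editor's note] A minimal sketch of points 2, 4 and 5 above. The table and column names are taken from the thread's queries; the unique indexes and index names are assumptions about the schema, and must exist for ON DUPLICATE KEY UPDATE to fire:

-- Assumed unique indexes required by points 4 and 5 (names are hypothetical):
ALTER TABLE users ADD UNIQUE KEY uniq_username (username);
ALTER TABLE link  ADD UNIQUE KEY uniq_url (url);

-- One query replaces the SELECT-then-INSERT-or-UPDATE round trips:
INSERT INTO users (username, profile_img)
VALUES ('some_user', 'http://example.com/avatar.png')
ON DUPLICATE KEY UPDATE profile_img = VALUES(profile_img);

-- UNIX_TIMESTAMP() (point 2) replaces the PHP-side date('U'):
INSERT INTO link (url, count, long_url, first_seen, last_seen)
VALUES ('http://example.com', 1, 'http://example.com/long', UNIX_TIMESTAMP(), UNIX_TIMESTAMP())
ON DUPLICATE KEY UPDATE count = count + 1, last_seen = UNIX_TIMESTAMP();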
PFMaBiSmAd · Posted March 19, 2010

The following is the equivalent (untested, use at your own risk) code to what you posted above:

<?php
// The following query does not seem to need an ORDER BY term, and assuming
// that the processed column is an INT, remove the single quotes around the zero value.
$get_data = mysql_query("SELECT * FROM raw_data WHERE processed = 0")
    or trigger_error('RawDataQueryFailed: ' . mysql_error());

while ($data = mysql_fetch_assoc($get_data)) { // continuously fetch data
    // use the $data variable directly
    // $row_id = $data['id'];
    $tweet = json_decode($data['tweets']);

    if ($tweet->user->screen_name != '' && $tweet->text != '') {
        $date = time(); // use the time() function. Since this value is used in more than one query, fix the value at this point
        $user = mysql_real_escape_string($tweet->user->screen_name);
        $twt  = mysql_real_escape_string($tweet->text);
        $prof_img = mysql_real_escape_string($tweet->user->profile_image_url);

        echo 'Username: '.$tweet->user->screen_name.'<br>';
        echo 'Tweet: '.$tweet->text.'<br>';
        //echo 'Date: '.$date.'<br>';
        // link is not set in this code
        // echo 'Link: '.$link.'<br>';
        //echo '<img src="'.$tweet->user->profile_image_url.'">'; // this actually causes the browser to fetch the image
        echo '<hr>';

        // insert data into db
        // link is not being set in the posted code, so every access of it slows php down since it must still handle the error
        $link = '';
        if ($link != 'not found') {
            // Check if user exists in db //
            $query = "INSERT INTO users (username, profile_img) VALUES ('$user', '$prof_img')
                ON DUPLICATE KEY UPDATE profile_img = '$prof_img', id = LAST_INSERT_ID(id)";
            mysql_query($query);
            // the $u_id variable was not used in this code
            // $u_id = mysql_insert_id();

            // Record link //
            $query = "INSERT INTO link (url, count, long_url, first_seen, last_seen) VALUES ('$link', 1, '$long_url', $date, $date)
                ON DUPLICATE KEY UPDATE count = (count + 1), last_seen = $date, id = LAST_INSERT_ID(id)";
            mysql_query($query);
            $link_id = mysql_insert_id(); // get the last id from the insert or from the update

            // Insert tweet //
            $sql = mysql_query("INSERT INTO tweets (tweet, date, link_id, username) VALUES ('$twt', $date, $link_id, '$user')");
            $tweet_id = mysql_insert_id();

            // Create relationship
            $sql = mysql_query("INSERT INTO relationships (username, tweet_id, link_id) VALUES ('$user', $tweet_id, $link_id)");
        }
    }

    mysql_query("DELETE FROM raw_data WHERE id = {$data['id']}");
} // end while loop
?>

You have some undefined/unset variables in the code that probably account for symptoms like it not inserting into the links row; those you would need to troubleshoot (i.e. we don't know where you intended to get the missing data from). I kept the $date variable (changed it to use the faster time() function since it is used in more than one query). I also made an assumption about the auto-increment column name 'id', in order to satisfy the LAST_INSERT_ID() function in the query (needed so that the UPDATE query will also set the mysql_insert_id() value).
wee493 · Posted March 20, 2010 · Author

Awesome, thanks for all the help! I have optimized it enough that the highest rate I have achieved is 2,000 tweets in a minute. I have used much of PFMaBiSmAd's code, and here are my queries:

$user = mysql_query("INSERT INTO users (username, profile_img) VALUES ('$user', '$prof_img')
    ON DUPLICATE KEY UPDATE profile_img = '$prof_img'");

// Record link //
$rec_link = mysql_query("INSERT INTO link (url, count, long_url, first_seen, last_seen) VALUES ('$link', '1', '$long_url', '$date', '$date')
    ON DUPLICATE KEY UPDATE count = (count + 1), last_seen = '$today'");

// Insert tweet //
$ins_tweet = mysql_query("INSERT INTO tweets (tweet, tweet_id, date, username, sfsdfg) VALUES ('$twt', '$tweet_id', '$date', '$user', '$today')");

// Create relationship
$relation = mysql_query("INSERT INTO relationships (username, rel_id, link) VALUES ('$user', '$rel_id', '$link')");

Now my problem is I'm processing them faster than I'm getting them. Does anyone have any suggestions for this? Currently I have set it up as a function and have a for loop to continue running the script, and within the for loop it checks if there are fewer than 1,000 tweets to process. If there are fewer than 1,000, it sleeps for 45 seconds.
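[Editor's note] A minimal sketch of the polling arrangement described above, using the thread's mysql_* API. The function name, count query, and loop bound are illustrative assumptions, not the poster's actual code:

<?php
// Hypothetical outer loop: process_batch() stands in for the decoding
// function described in the post above.
for ($i = 0; $i < 60; $i++) {
    $res = mysql_query("SELECT COUNT(*) AS pending FROM raw_data WHERE processed = 0");
    $row = mysql_fetch_assoc($res);
    if ($row['pending'] < 1000) {
        sleep(45); // fewer than 1,000 tweets waiting, so pause before checking again
    } else {
        process_batch(); // decode and insert the waiting rows
    }
}
?>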
PFMaBiSmAd · Posted March 20, 2010

How are you invoking the current script? It sounds like you have 'forked' a background process and the script runs continuously in a loop?

You could use a cron job/scheduled task to invoke the script once a minute (one minute is often the smallest cron period you can choose). This would eliminate extra code, and you could just process any waiting records, which would prevent any records from being 'stuck' in the queue.

You might also consider using a single script. The script that receives the original data could just process it as it is received. If the script that is processing the original data is running on the same server as the script you have shown in this thread, a single script would actually reduce the amount of processing time used, and the single script would have a greater throughput than the current arrangement.
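[Editor's note] A typical crontab entry for the once-a-minute approach suggested above. The script path, PHP binary location, and log path are hypothetical:

* * * * * /usr/bin/php /path/to/decode_tweets.php >> /var/log/decode_tweets.log 2>&1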
Ruzzas · Posted March 20, 2010

(quoting PFMaBiSmAd's post above) Indeed, you are correct: running a script every minute or every few minutes is what you need, unless you can make a timer in Perl which runs the script in a loop.