Get all the URLs using multi curl
I'm working on an app that gets all the URLs from an array of sites and displays them as an array or as JSON.
I can do it with a for loop, but the problem is execution time: with just 10 URLs it fails with an error saying the maximum execution time was exceeded.
While searching I found multi cURL, and also the article "Fast PHP CURL Multiple Requests: Retrieve the content of multiple URLs using CURL". I tried to add my code to it, but it didn't work because I don't know how to use the function.
Hope you can help me. Thanks.
This is my sample code:
<?php
$urls = array(
    'http://site1.com/',
    'http://site2.com/',
    'http://site3.com/');

$mh = curl_multi_init();
foreach ($urls as $i => $url) {

    $urlContent = file_get_contents($url);

    $dom = new DOMDocument();
    @$dom->loadHTML($urlContent);
    $xpath = new DOMXPath($dom);
    $hrefs = $xpath->evaluate("/html/body//a");

    for ($i = 0; $i < $hrefs->length; $i++) {
        $href = $hrefs->item($i);
        $url = $href->getAttribute('href');
        $url = filter_var($url, FILTER_SANITIZE_URL);
        // validate url
        if (!filter_var($url, FILTER_VALIDATE_URL) === false) {
            echo '<a href="'.$url.'">'.$url.'</a><br />';
        }
    }

    $conn[$i] = curl_init($url);
    $fp[$i] = fopen($g, "w");
    curl_setopt($conn[$i], CURLOPT_FILE, $fp[$i]);
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    curl_multi_add_handle($mh, $conn[$i]);
}

do {
    $n = curl_multi_exec($mh, $active);
} while ($active);

foreach ($urls as $i => $url) {
    curl_multi_remove_handle($mh, $conn[$i]);
    curl_close($conn[$i]);
    fclose($fp[$i]);
}
curl_multi_close($mh);
?>
Tags: php, curl, curl-multi
"I tried to add my code but didn't work" What does "didn't work" mean? White page? Getting the wrong urls? Any error on screen? Or in your logs?
– kerbholz
Jan 2 at 8:52
Your problem most likely is not with the curl but with your do/while running (by some reason) forever... Try debugging that possibility.
– Carlos Alves Jorge
Jan 2 at 8:54
@kerbholz no sir, I cant use the the function properly
– user123
Jan 2 at 8:54
If you make it faster it'll hit execution time limit after, say, 50 urls.
– Salman A
Jan 4 at 23:30
What more have to you tried in these 4 days please share?
– Vickrant
Jan 7 at 12:35
6 Answers
Here is a function that I put together that will properly utilize the curl_multi_init() function. It is more or less the same function that you will find on PHP.net, with some minor tweaks. I have had great success with this.
function multi_thread_curl($urlArray, $optionArray, $nThreads) {

    //Group your urls into groups/threads.
    $curlArray = array_chunk($urlArray, $nThreads, $preserve_keys = true);

    //Iterate through each batch of urls.
    $ch = 'ch_';
    foreach ($curlArray as $threads) {

        //Create your cURL resources.
        foreach ($threads as $thread => $value) {
            ${$ch . $thread} = curl_init();
            curl_setopt_array(${$ch . $thread}, $optionArray); //Set your main curl options.
            curl_setopt(${$ch . $thread}, CURLOPT_URL, $value); //Set url.
        }

        //Create the multiple cURL handler.
        $mh = curl_multi_init();

        //Add the handles.
        foreach ($threads as $thread => $value) {
            curl_multi_add_handle($mh, ${$ch . $thread});
        }

        $active = null;

        //Execute the handles.
        do {
            $mrc = curl_multi_exec($mh, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);

        while ($active && $mrc == CURLM_OK) {
            if (curl_multi_select($mh) != -1) {
                do {
                    $mrc = curl_multi_exec($mh, $active);
                } while ($mrc == CURLM_CALL_MULTI_PERFORM);
            }
        }

        //Get your data and close the handles.
        foreach ($threads as $thread => $value) {
            $results[$thread] = curl_multi_getcontent(${$ch . $thread});
            curl_multi_remove_handle($mh, ${$ch . $thread});
        }

        //Close the multi handle exec.
        curl_multi_close($mh);
    }

    return $results;
}

//Add whatever options here. The CURLOPT_URL is left out intentionally.
//It will be added in later from the url array.
$optionArray = array(
    CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0', //Pick your user agent.
    CURLOPT_RETURNTRANSFER => TRUE,
    CURLOPT_TIMEOUT        => 10
);

//Create an array of your urls.
$urlArray = array(
    'http://site1.com/',
    'http://site2.com/',
    'http://site3.com/'
);

//Play around with this number and see what works best.
//This is how many urls it will try to do at one time.
$nThreads = 20;

//To use, run the function.
$results = multi_thread_curl($urlArray, $optionArray, $nThreads);
Once this is complete you will have an array containing all of the html from your list of websites. It is at this point where I would loop through them and pull out all of the urls.
Like so:
foreach ($results as $page) {

    $dom = new DOMDocument();
    @$dom->loadHTML($page);
    $xpath = new DOMXPath($dom);
    $hrefs = $xpath->evaluate("/html/body//a");

    for ($i = 0; $i < $hrefs->length; $i++) {
        $href = $hrefs->item($i);
        $url = $href->getAttribute('href');
        $url = filter_var($url, FILTER_SANITIZE_URL);
        // validate url
        if (!filter_var($url, FILTER_VALIDATE_URL) === false) {
            echo '<a href="'.$url.'">'.$url.'</a><br />';
        }
    }
}
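Since the question asks for the result as an array or JSON rather than echoed anchors, here is a minimal sketch of my own (building on the $results array returned by multi_thread_curl() above; the $allLinks name is only illustrative) that collects the links and emits them as JSON:

// Collect the extracted links per source page instead of echoing them,
// then output the whole structure as JSON. Assumes $results holds the
// HTML returned by multi_thread_curl() above.
$allLinks = array();

foreach ($results as $key => $page) {
    $dom = new DOMDocument();
    @$dom->loadHTML($page);
    $xpath = new DOMXPath($dom);
    $hrefs = $xpath->evaluate("/html/body//a");

    for ($i = 0; $i < $hrefs->length; $i++) {
        $url = filter_var($hrefs->item($i)->getAttribute('href'), FILTER_SANITIZE_URL);
        if (filter_var($url, FILTER_VALIDATE_URL) !== false) {
            $allLinks[$key][] = $url;
        }
    }
}

header('Content-Type: application/json');
echo json_encode($allLinks);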
It is also worth keeping in the back of your head that you can increase the run time of your script.
If you're using a hosting service, you may be restricted to something in the ballpark of two minutes regardless of what you set your max execution time to. Just food for thought.
This is done by:

ini_set('max_execution_time', 120);

You can always try more time, but you'll never know until you time it.
Hope it helps.
Comments:

Thanks, I'll check it. – user123, Jan 9 at 7:28

Yup, no problem, let me know if you have any issues. – Joseph_J, Jan 9 at 7:37

I got a problem: "Warning: Use of undefined constant ch - assumed 'ch' (this will throw an Error in a future version of PHP)" on these 3 lines: ${ch . $thread} = curl_init(); curl_setopt_array(${ch . $thread}, $optionArray); curl_setopt(${ch . $thread}, CURLOPT_URL, $value); – user123, Jan 10 at 6:34

Thank you for pointing that out. I have addressed the issue. Please check the updated answer. – Joseph_J, Jan 10 at 7:58

I made some changes to the updated code. Use the current code. Sorry for the rapid changes. – Joseph_J, Jan 10 at 8:17
You may be using an endless loop — if not, you can increase the maximum execution time in php.ini or with:
ini_set('max_execution_time', 600); // 600 seconds = 10 minutes
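For reference, the equivalent setting can also be made globally in php.ini (a small addition of mine; you need access to php.ini and a web server/PHP-FPM restart for it to take effect):

; php.ini
max_execution_time = 600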
There is a design flaw if you have to increase execution time to 10 minutes. – Vickrant, Jan 7 at 12:35
First of all, I know the OP is asking about multi_curl, but I'm just adding another alternative in case the OP changes his mind. What I do here is split the URLs across many requests so the CPU usage is not that big. If the OP still wants to use multi_curl, maybe a PHP master here can give a better solution.
<?php
$num = preg_replace('/[^0-9]/', '', isset($_GET['num']) ? $_GET['num'] : '');
$num = ($num === '') ? 0 : (int) $num;

$urls = array(
    'http://site1.com/',
    'http://site2.com/',
    'http://site3.com/');

// Check the "done" marker before the digit-stripped index, otherwise
// "done" is turned into 0 and the first URL would be fetched forever.
if (isset($_GET['num']) && $_GET['num'] == 'done')
{
    /* if all sites have been fetched, do something here */
}
elseif (!empty($urls[$num]))
{
    /* do your single curl stuff here and store its data here */

    /* now redirect to the next url. dont use a header location redirect,
       it would end up with a "too many redirects" error in the browser */
    $next = !empty($urls[$num + 1]) ? $num + 1 : 'done';

    echo '<html>
    <head>
    <meta http-equiv="refresh" content="0;url=http://yourcodedomain.com/yourpath/yourcode.php?num='.$next.'" />
    </head>
    <body>
    <p>Fetching: '.($num + 1).' / '.count($urls).'</p>
    </body>
    </html>';
}
else
{
    /* throws exception here */
}
?>
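As a side note, the "do your single curl stuff here" placeholder above could be filled with an ordinary single-request fetch; a minimal sketch (the variable names are mine, and how you persist the data is up to you):

// Fetch the current URL with a plain, single cURL request (sketch only).
$c = curl_init($urls[$num]);
curl_setopt($c, CURLOPT_RETURNTRANSFER, true);
curl_setopt($c, CURLOPT_TIMEOUT, 10);
$data = curl_exec($c);
curl_close($c);
// ...persist $data (file, database, session) before the refresh redirect.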
I had the same issue, then I solved it using usleep(). Try this and let me know:

do {
    usleep(10000);
    $n = curl_multi_exec($mh, $active);
} while ($active);
Try this simplified version:
$urls = [
    'https://en.wikipedia.org/',
    'https://secure.php.net/',
];

set_time_limit(0);
libxml_use_internal_errors(true);
$hrefs = [];

foreach ($urls as $url) {
    $html = file_get_contents($url);
    $doc = new DOMDocument;
    $doc->loadHTML($html);
    foreach ($doc->getElementsByTagName('a') as $link) {
        $href = filter_var($link->getAttribute('href'), FILTER_SANITIZE_URL);
        if (filter_var($href, FILTER_VALIDATE_URL)) {
            echo "<a href='{$href}'>{$href}</a><br/>\n";
        }
    }
}
This is what I achieved after working on the code. It worked, but I'm not sure if this is the best answer. Kindly check my code.
<?php
$array = array('https://www.google.com/','https://www.google.com/','https://www.google.com/','https://www.google.com/','https://www.google.com/','https://www.google.com/','https://www.google.com/','https://www.google.com/','https://www.google.com/','https://www.google.com/');

print_r(getUrls($array));

function getUrls($array) {

    $arrUrl = array();
    $arrList = array();
    $url_count = count($array);
    $curl_array = array();

    $ch = curl_multi_init();
    foreach ($array as $count => $url) {
        $curl_array[$count] = curl_init($url);
        curl_setopt($curl_array[$count], CURLOPT_RETURNTRANSFER, true);
        curl_multi_add_handle($ch, $curl_array[$count]);
    }

    do {
        curl_multi_exec($ch, $exec);
        curl_multi_select($ch, 1);
    } while ($exec);

    foreach ($array as $count => $url) {
        $arrUrl = array();
        $urlContent = curl_multi_getcontent($curl_array[$count]);
        $dom = new DOMDocument();
        @$dom->loadHTML($urlContent);
        $xpath = new DOMXPath($dom);
        $hrefs = $xpath->evaluate("/html/body//a");

        for ($i = 0; $i < $hrefs->length; $i++) {
            $href = $hrefs->item($i);
            $url = $href->getAttribute('href');
            $url = filter_var($url, FILTER_SANITIZE_URL);
            // validate url and skip mailto: links
            if (filter_var($url, FILTER_VALIDATE_URL) !== false) {
                if (strpos($url, 'mailto') === false) {
                    $arrUrl[] = $url;
                }
            }
        }
        array_push($arrList, array_unique($arrUrl));
    }

    foreach ($array as $count => $url) {
        curl_multi_remove_handle($ch, $curl_array[$count]);
    }
    curl_multi_close($ch);

    foreach ($array as $count => $url) {
        curl_close($curl_array[$count]);
    }

    return $arrList;
}
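Since the original goal was output as an array or JSON, a small follow-up sketch (my addition, reusing the getUrls() function above): the result can be emitted as JSON instead of the print_r dump:

// Output the collected link lists as JSON instead of a printed array.
header('Content-Type: application/json');
echo json_encode(getUrls($array), JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES);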
Can you explain why I got a downvote? – user123, Jan 8 at 2:54