Regexp with differen t(unknown) string matching

I am trying to grab ALL HREF links using the GoColly framework but
only allow urls of ANY domain to be Root URL's or SubDomains(NO
paths). I've commented out my REGEXP. The file extensions doesn't
matter. I just don't want anything after a "/". I've tried using the
variable "Domain" and concat to build a "varied" regexp. Not even sure
if that's a thing.

Twitter.Com - TRUE

hello.authac.com - TRUE

whole.facebook.com -TRUE

twiiter.com/dd -FALSE

hello.authac.com/sd.html FALSE

whole.facebook.com/sdsd/dsd/as.txt FALSE

// Main

package main

import (

"log"

"fmt"

"time"

//"regexp"

"net/http"

"github.com/gocolly/colly"

)



var Target string

var Domain string





func main() {

//r := regexp.MustCompile("(https:" + Domain + ".com)$")





c := colly.NewCollector(

    //colly.URLFilters(r),



)











c.OnError(func(r *colly.Response, err error) {

    fmt.Println(r.Request.URL, "Is Not Reachable", r.StatusCode) 

})







// Find and visit all links

c.OnHTML("a", func(e *colly.HTMLElement) {

    e.Request.Visit(e.Attr("href"))

})





c.OnRequest(func(r *colly.Request) {

    Domain := r.URL.String()

    Target := BannerGrab(Domain)

    fmt.Println(Target)

    fmt.Println("Dropping By.. ", r.URL)

    time.Sleep(1000 * time.Millisecond)

})



c.Visit("https://www.twitter.com/")

}



//CheckDB if not listed else add

//RiskDB

//Email





func BannerGrab(s string) string {



client := &http.Client{}

req, err := http.NewRequest("GET", s, nil)

    if err != nil {

    log.Fatalln(err)

}

req.Header.Set("User-Agent", "Authac/0.1")

resp, _ := client.Do(req)

serverEntry := resp.Header.Get("Server")

return serverEntry



}

edited Nov 21 '18 at 15:53

Flimzy

38.7k106597

asked Nov 21 '18 at 14:01

Authac

Hello, you just want true or false matches based on the presence or absence of a "/", or you want to match anything before the "/" anyway, to extract the root URL?

– FMarazzi
Nov 21 '18 at 14:08

I want to match anything before the / to only allow domain grabbing of ROOT URL's. Your question is a little funky with how Go-Colly work but in a essence. Root URL's and subdomains should be the only thing that I get when println comes up.

– Authac
Nov 21 '18 at 15:04

add a comment |

Twitter.Com - TRUE

hello.authac.com - TRUE

whole.facebook.com -TRUE

twiiter.com/dd -FALSE

hello.authac.com/sd.html FALSE

whole.facebook.com/sdsd/dsd/as.txt FALSE

// Main

package main

import (

"log"

"fmt"

"time"

//"regexp"

"net/http"

"github.com/gocolly/colly"

)



var Target string

var Domain string





func main() {

//r := regexp.MustCompile("(https:" + Domain + ".com)$")





c := colly.NewCollector(

    //colly.URLFilters(r),



)











c.OnError(func(r *colly.Response, err error) {

    fmt.Println(r.Request.URL, "Is Not Reachable", r.StatusCode) 

})







// Find and visit all links

c.OnHTML("a", func(e *colly.HTMLElement) {

    e.Request.Visit(e.Attr("href"))

})





c.OnRequest(func(r *colly.Request) {

    Domain := r.URL.String()

    Target := BannerGrab(Domain)

    fmt.Println(Target)

    fmt.Println("Dropping By.. ", r.URL)

    time.Sleep(1000 * time.Millisecond)

})



c.Visit("https://www.twitter.com/")

}



//CheckDB if not listed else add

//RiskDB

//Email





func BannerGrab(s string) string {



client := &http.Client{}

req, err := http.NewRequest("GET", s, nil)

    if err != nil {

    log.Fatalln(err)

}

req.Header.Set("User-Agent", "Authac/0.1")

resp, _ := client.Do(req)

serverEntry := resp.Header.Get("Server")

return serverEntry



}

edited Nov 21 '18 at 15:53

Flimzy

38.7k106597

asked Nov 21 '18 at 14:01

Authac

Hello, you just want true or false matches based on the presence or absence of a "/", or you want to match anything before the "/" anyway, to extract the root URL?

– FMarazzi
Nov 21 '18 at 14:08

I want to match anything before the / to only allow domain grabbing of ROOT URL's. Your question is a little funky with how Go-Colly work but in a essence. Root URL's and subdomains should be the only thing that I get when println comes up.

– Authac
Nov 21 '18 at 15:04

add a comment |

Twitter.Com - TRUE

hello.authac.com - TRUE

whole.facebook.com -TRUE

twiiter.com/dd -FALSE

hello.authac.com/sd.html FALSE

whole.facebook.com/sdsd/dsd/as.txt FALSE

// Main

package main

import (

"log"

"fmt"

"time"

//"regexp"

"net/http"

"github.com/gocolly/colly"

)



var Target string

var Domain string





func main() {

//r := regexp.MustCompile("(https:" + Domain + ".com)$")





c := colly.NewCollector(

    //colly.URLFilters(r),



)











c.OnError(func(r *colly.Response, err error) {

    fmt.Println(r.Request.URL, "Is Not Reachable", r.StatusCode) 

})







// Find and visit all links

c.OnHTML("a", func(e *colly.HTMLElement) {

    e.Request.Visit(e.Attr("href"))

})





c.OnRequest(func(r *colly.Request) {

    Domain := r.URL.String()

    Target := BannerGrab(Domain)

    fmt.Println(Target)

    fmt.Println("Dropping By.. ", r.URL)

    time.Sleep(1000 * time.Millisecond)

})



c.Visit("https://www.twitter.com/")

}



//CheckDB if not listed else add

//RiskDB

//Email





func BannerGrab(s string) string {



client := &http.Client{}

req, err := http.NewRequest("GET", s, nil)

    if err != nil {

    log.Fatalln(err)

}

req.Header.Set("User-Agent", "Authac/0.1")

resp, _ := client.Do(req)

serverEntry := resp.Header.Get("Server")

return serverEntry



}

edited Nov 21 '18 at 15:53

Flimzy

38.7k106597

asked Nov 21 '18 at 14:01

Authac

Twitter.Com - TRUE

hello.authac.com - TRUE

whole.facebook.com -TRUE

twiiter.com/dd -FALSE

hello.authac.com/sd.html FALSE

whole.facebook.com/sdsd/dsd/as.txt FALSE

// Main

package main

import (

"log"

"fmt"

"time"

//"regexp"

"net/http"

"github.com/gocolly/colly"

)



var Target string

var Domain string





func main() {

//r := regexp.MustCompile("(https:" + Domain + ".com)$")





c := colly.NewCollector(

    //colly.URLFilters(r),



)











c.OnError(func(r *colly.Response, err error) {

    fmt.Println(r.Request.URL, "Is Not Reachable", r.StatusCode) 

})







// Find and visit all links

c.OnHTML("a", func(e *colly.HTMLElement) {

    e.Request.Visit(e.Attr("href"))

})





c.OnRequest(func(r *colly.Request) {

    Domain := r.URL.String()

    Target := BannerGrab(Domain)

    fmt.Println(Target)

    fmt.Println("Dropping By.. ", r.URL)

    time.Sleep(1000 * time.Millisecond)

})



c.Visit("https://www.twitter.com/")

}



//CheckDB if not listed else add

//RiskDB

//Email





func BannerGrab(s string) string {



client := &http.Client{}

req, err := http.NewRequest("GET", s, nil)

    if err != nil {

    log.Fatalln(err)

}

req.Header.Set("User-Agent", "Authac/0.1")

resp, _ := client.Do(req)

serverEntry := resp.Header.Get("Server")

return serverEntry



}

regex go

edited Nov 21 '18 at 15:53

Flimzy

38.7k106597

asked Nov 21 '18 at 14:01

Authac

edited Nov 21 '18 at 15:53

Flimzy

38.7k106597

asked Nov 21 '18 at 14:01

Authac

edited Nov 21 '18 at 15:53

Flimzy

38.7k106597

edited Nov 21 '18 at 15:53

Flimzy

38.7k106597

edited Nov 21 '18 at 15:53

Flimzy

38.7k106597

asked Nov 21 '18 at 14:01

Authac

asked Nov 21 '18 at 14:01

Authac

asked Nov 21 '18 at 14:01

Authac

Hello, you just want true or false matches based on the presence or absence of a "/", or you want to match anything before the "/" anyway, to extract the root URL?

– FMarazzi
Nov 21 '18 at 14:08

I want to match anything before the / to only allow domain grabbing of ROOT URL's. Your question is a little funky with how Go-Colly work but in a essence. Root URL's and subdomains should be the only thing that I get when println comes up.

– Authac
Nov 21 '18 at 15:04

add a comment |

Hello, you just want true or false matches based on the presence or absence of a "/", or you want to match anything before the "/" anyway, to extract the root URL?

– FMarazzi
Nov 21 '18 at 14:08

I want to match anything before the / to only allow domain grabbing of ROOT URL's. Your question is a little funky with how Go-Colly work but in a essence. Root URL's and subdomains should be the only thing that I get when println comes up.

– Authac
Nov 21 '18 at 15:04

Hello, you just want true or false matches based on the presence or absence of a "/", or you want to match anything before the "/" anyway, to extract the root URL?

– FMarazzi
Nov 21 '18 at 14:08

I want to match anything before the / to only allow domain grabbing of ROOT URL's. Your question is a little funky with how Go-Colly work but in a essence. Root URL's and subdomains should be the only thing that I get when println comes up.

– Authac
Nov 21 '18 at 15:04

add a comment |

2 Answers
2

active

oldest

votes

A possible regex is:

/ ^[^/\]+$ / gmi

Which does not match any time there is a "" or "/" in the text.

answered Nov 21 '18 at 14:11

FMarazzi

323213

golang regex's don't support flags like gmi, and don't require delimiters like /. Also this regex is just not going to work because https://twitter.com should be accepted as per OP's question

– Elias Van Ootegem
Nov 21 '18 at 17:24

Agreed, but he specified "I just don't want anything after a "/"."

– FMarazzi
Nov 21 '18 at 17:42

add a comment |

The bellow regex can match URLs with only domain and subdomain with no path,

r := regexp.MustCompile("(https|http)://(.*?)")

To add, colly.URLFilters will only crawl URL's matching the pattern. It will also not be creating URL's matching the pattern from the full URL and then do crawling.

answered Nov 21 '18 at 14:36

Jeevan

1326

The suggested example didn't work. Am I looking at colly.URLFilters and colly.Disallow backwards? It should only crawl root and subdomains with the above regex yet it stills traverse beyond that.

– Authac
Nov 21 '18 at 15:02

add a comment |

Your Answer

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53413766%2fregexp-with-differen-tunknown-string-matching%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

2 Answers
2

active

oldest

votes

2 Answers
2

active

oldest

votes

A possible regex is:

/ ^[^/\]+$ / gmi

Which does not match any time there is a "" or "/" in the text.

answered Nov 21 '18 at 14:11

FMarazzi

323213

golang regex's don't support flags like gmi, and don't require delimiters like /. Also this regex is just not going to work because https://twitter.com should be accepted as per OP's question

– Elias Van Ootegem
Nov 21 '18 at 17:24

Agreed, but he specified "I just don't want anything after a "/"."

– FMarazzi
Nov 21 '18 at 17:42

add a comment |

A possible regex is:

/ ^[^/\]+$ / gmi

Which does not match any time there is a "" or "/" in the text.

answered Nov 21 '18 at 14:11

FMarazzi

323213

golang regex's don't support flags like gmi, and don't require delimiters like /. Also this regex is just not going to work because https://twitter.com should be accepted as per OP's question

– Elias Van Ootegem
Nov 21 '18 at 17:24

Agreed, but he specified "I just don't want anything after a "/"."

– FMarazzi
Nov 21 '18 at 17:42

add a comment |

A possible regex is:

/ ^[^/\]+$ / gmi

Which does not match any time there is a "" or "/" in the text.

answered Nov 21 '18 at 14:11

FMarazzi

323213

A possible regex is:

/ ^[^/\]+$ / gmi

Which does not match any time there is a "" or "/" in the text.

answered Nov 21 '18 at 14:11

FMarazzi

323213

answered Nov 21 '18 at 14:11

FMarazzi

323213

answered Nov 21 '18 at 14:11

FMarazzi

323213

answered Nov 21 '18 at 14:11

FMarazzi

323213

golang regex's don't support flags like gmi, and don't require delimiters like /. Also this regex is just not going to work because https://twitter.com should be accepted as per OP's question

– Elias Van Ootegem
Nov 21 '18 at 17:24

Agreed, but he specified "I just don't want anything after a "/"."

– FMarazzi
Nov 21 '18 at 17:42

add a comment |

golang regex's don't support flags like gmi, and don't require delimiters like /. Also this regex is just not going to work because https://twitter.com should be accepted as per OP's question

– Elias Van Ootegem
Nov 21 '18 at 17:24

Agreed, but he specified "I just don't want anything after a "/"."

– FMarazzi
Nov 21 '18 at 17:42

golang regex's don't support flags like gmi, and don't require delimiters like /. Also this regex is just not going to work because https://twitter.com should be accepted as per OP's question

– Elias Van Ootegem
Nov 21 '18 at 17:24

Agreed, but he specified "I just don't want anything after a "/"."

– FMarazzi
Nov 21 '18 at 17:42

add a comment |

The bellow regex can match URLs with only domain and subdomain with no path,

r := regexp.MustCompile("(https|http)://(.*?)")

To add, colly.URLFilters will only crawl URL's matching the pattern. It will also not be creating URL's matching the pattern from the full URL and then do crawling.

answered Nov 21 '18 at 14:36

Jeevan

1326

The suggested example didn't work. Am I looking at colly.URLFilters and colly.Disallow backwards? It should only crawl root and subdomains with the above regex yet it stills traverse beyond that.

– Authac
Nov 21 '18 at 15:02

add a comment |

The bellow regex can match URLs with only domain and subdomain with no path,

r := regexp.MustCompile("(https|http)://(.*?)")

To add, colly.URLFilters will only crawl URL's matching the pattern. It will also not be creating URL's matching the pattern from the full URL and then do crawling.

answered Nov 21 '18 at 14:36

Jeevan

1326

The suggested example didn't work. Am I looking at colly.URLFilters and colly.Disallow backwards? It should only crawl root and subdomains with the above regex yet it stills traverse beyond that.

– Authac
Nov 21 '18 at 15:02

add a comment |

The bellow regex can match URLs with only domain and subdomain with no path,

r := regexp.MustCompile("(https|http)://(.*?)")

To add, colly.URLFilters will only crawl URL's matching the pattern. It will also not be creating URL's matching the pattern from the full URL and then do crawling.

answered Nov 21 '18 at 14:36

Jeevan

1326

The bellow regex can match URLs with only domain and subdomain with no path,

r := regexp.MustCompile("(https|http)://(.*?)")

To add, colly.URLFilters will only crawl URL's matching the pattern. It will also not be creating URL's matching the pattern from the full URL and then do crawling.

answered Nov 21 '18 at 14:36

Jeevan

1326

answered Nov 21 '18 at 14:36

Jeevan

1326

answered Nov 21 '18 at 14:36

Jeevan

1326

answered Nov 21 '18 at 14:36

Jeevan

1326

The suggested example didn't work. Am I looking at colly.URLFilters and colly.Disallow backwards? It should only crawl root and subdomains with the above regex yet it stills traverse beyond that.

– Authac
Nov 21 '18 at 15:02

add a comment |

The suggested example didn't work. Am I looking at colly.URLFilters and colly.Disallow backwards? It should only crawl root and subdomains with the above regex yet it stills traverse beyond that.

– Authac
Nov 21 '18 at 15:02

The suggested example didn't work. Am I looking at colly.URLFilters and colly.Disallow backwards? It should only crawl root and subdomains with the above regex yet it stills traverse beyond that.

– Authac
Nov 21 '18 at 15:02

add a comment |

draft saved

draft discarded

Thanks for contributing an answer to Stack Overflow!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

Search This Blog

Ufyukyu