Regexp with different (unknown) string matching












0















I am trying to grab ALL href links using the GoColly framework, but
only allow URLs of ANY domain that are root URLs or subdomains (NO
paths). I've commented out my regexp. The file extension doesn't
matter; I just don't want anything after a "/". I've tried using the
variable "Domain" and concatenation to build a "varied" regexp. I'm not
even sure if that's a thing.




  1. Twitter.Com - TRUE

  2. hello.authac.com - TRUE

  3. whole.facebook.com -TRUE

  4. twiiter.com/dd -FALSE

  5. hello.authac.com/sd.html FALSE


  6. whole.facebook.com/sdsd/dsd/as.txt FALSE



    // Main
    package main
    import (
    "log"
    "fmt"
    "time"
    //"regexp"
    "net/http"
    "github.com/gocolly/colly"
    )

    var Target string
    var Domain string


    // main crawls outward from https://www.twitter.com/, following every <a>
    // link but only requesting root URLs (a bare domain or subdomain with no
    // path), and prints the Server banner of each URL it requests.
    func main() {
    	//r := regexp.MustCompile("(https:" + Domain + ".com)$")

    	c := colly.NewCollector(
    	//colly.URLFilters(r),
    	)

    	c.OnError(func(r *colly.Response, err error) {
    		fmt.Println(r.Request.URL, "Is Not Reachable", r.StatusCode)
    	})

    	// Find and visit all links.
    	c.OnHTML("a", func(e *colly.HTMLElement) {
    		// Visit can return an error for links the collector declines to
    		// fetch (for example ones it has already visited); that is expected
    		// while crawling, so the result is deliberately discarded.
    		_ = e.Request.Visit(e.Attr("href"))
    	})

    	c.OnRequest(func(r *colly.Request) {
    		// Only root URLs are wanted: cancel anything with a path after the host
    		// before any network traffic is generated for it.
    		if p := r.URL.Path; p != "" && p != "/" {
    			r.Abort()
    			return
    		}
    		// Use a distinct local name so the package-level Target and Domain
    		// variables are not shadowed.
    		banner := BannerGrab(r.URL.String())
    		fmt.Println(banner)
    		fmt.Println("Dropping By.. ", r.URL)
    		// Crude politeness delay between requests.
    		time.Sleep(time.Second)
    	})

    	if err := c.Visit("https://www.twitter.com/"); err != nil {
    		log.Fatalln(err)
    	}
    }

    //CheckDB if not listed else add
    //RiskDB
    //Email


    // BannerGrab sends a GET request to s with the "Authac/0.1" User-Agent and
    // returns the value of the response's Server header. It returns "" (after
    // logging the cause) when the request cannot be built or sent, and also
    // when the response carries no Server header.
    func BannerGrab(s string) string {
    	// Bound the request so a stalled host cannot hang the caller.
    	client := &http.Client{Timeout: 10 * time.Second}
    	req, err := http.NewRequest(http.MethodGet, s, nil)
    	if err != nil {
    		log.Printf("banner grab %q: %v", s, err)
    		return ""
    	}
    	req.Header.Set("User-Agent", "Authac/0.1")
    	resp, err := client.Do(req)
    	if err != nil {
    		// resp is nil on error; reading its headers would panic.
    		log.Printf("banner grab %q: %v", s, err)
    		return ""
    	}
    	defer resp.Body.Close()
    	return resp.Header.Get("Server")
    }











share|improve this question

























  • Hello, you just want true or false matches based on the presence or absence of a "/", or you want to match anything before the "/" anyway, to extract the root URL?

    – FMarazzi
    Nov 21 '18 at 14:08











  • I want to match anything before the / to only allow domain grabbing of ROOT URL's. Your question is a little funky with how Go-Colly work but in a essence. Root URL's and subdomains should be the only thing that I get when println comes up.

    – Authac
    Nov 21 '18 at 15:04
















0















I am trying to grab ALL href links using the GoColly framework, but
only allow URLs of ANY domain that are root URLs or subdomains (NO
paths). I've commented out my regexp. The file extension doesn't
matter; I just don't want anything after a "/". I've tried using the
variable "Domain" and concatenation to build a "varied" regexp. I'm not
even sure if that's a thing.




  1. Twitter.Com - TRUE

  2. hello.authac.com - TRUE

  3. whole.facebook.com -TRUE

  4. twiiter.com/dd -FALSE

  5. hello.authac.com/sd.html FALSE


  6. whole.facebook.com/sdsd/dsd/as.txt FALSE



    // Main
    package main
    import (
    "log"
    "fmt"
    "time"
    //"regexp"
    "net/http"
    "github.com/gocolly/colly"
    )

    var Target string
    var Domain string


    // main crawls outward from https://www.twitter.com/, following every <a>
    // link but only requesting root URLs (a bare domain or subdomain with no
    // path), and prints the Server banner of each URL it requests.
    func main() {
    	//r := regexp.MustCompile("(https:" + Domain + ".com)$")

    	c := colly.NewCollector(
    	//colly.URLFilters(r),
    	)

    	c.OnError(func(r *colly.Response, err error) {
    		fmt.Println(r.Request.URL, "Is Not Reachable", r.StatusCode)
    	})

    	// Find and visit all links.
    	c.OnHTML("a", func(e *colly.HTMLElement) {
    		// Visit can return an error for links the collector declines to
    		// fetch (for example ones it has already visited); that is expected
    		// while crawling, so the result is deliberately discarded.
    		_ = e.Request.Visit(e.Attr("href"))
    	})

    	c.OnRequest(func(r *colly.Request) {
    		// Only root URLs are wanted: cancel anything with a path after the host
    		// before any network traffic is generated for it.
    		if p := r.URL.Path; p != "" && p != "/" {
    			r.Abort()
    			return
    		}
    		// Use a distinct local name so the package-level Target and Domain
    		// variables are not shadowed.
    		banner := BannerGrab(r.URL.String())
    		fmt.Println(banner)
    		fmt.Println("Dropping By.. ", r.URL)
    		// Crude politeness delay between requests.
    		time.Sleep(time.Second)
    	})

    	if err := c.Visit("https://www.twitter.com/"); err != nil {
    		log.Fatalln(err)
    	}
    }

    //CheckDB if not listed else add
    //RiskDB
    //Email


    // BannerGrab sends a GET request to s with the "Authac/0.1" User-Agent and
    // returns the value of the response's Server header. It returns "" (after
    // logging the cause) when the request cannot be built or sent, and also
    // when the response carries no Server header.
    func BannerGrab(s string) string {
    	// Bound the request so a stalled host cannot hang the caller.
    	client := &http.Client{Timeout: 10 * time.Second}
    	req, err := http.NewRequest(http.MethodGet, s, nil)
    	if err != nil {
    		log.Printf("banner grab %q: %v", s, err)
    		return ""
    	}
    	req.Header.Set("User-Agent", "Authac/0.1")
    	resp, err := client.Do(req)
    	if err != nil {
    		// resp is nil on error; reading its headers would panic.
    		log.Printf("banner grab %q: %v", s, err)
    		return ""
    	}
    	defer resp.Body.Close()
    	return resp.Header.Get("Server")
    }











share|improve this question

























  • Hello, you just want true or false matches based on the presence or absence of a "/", or you want to match anything before the "/" anyway, to extract the root URL?

    – FMarazzi
    Nov 21 '18 at 14:08











  • I want to match anything before the / to only allow domain grabbing of ROOT URL's. Your question is a little funky with how Go-Colly work but in a essence. Root URL's and subdomains should be the only thing that I get when println comes up.

    – Authac
    Nov 21 '18 at 15:04














0












0








0








I am trying to grab ALL href links using the GoColly framework, but
only allow URLs of ANY domain that are root URLs or subdomains (NO
paths). I've commented out my regexp. The file extension doesn't
matter; I just don't want anything after a "/". I've tried using the
variable "Domain" and concatenation to build a "varied" regexp. I'm not
even sure if that's a thing.




  1. Twitter.Com - TRUE

  2. hello.authac.com - TRUE

  3. whole.facebook.com -TRUE

  4. twiiter.com/dd -FALSE

  5. hello.authac.com/sd.html FALSE


  6. whole.facebook.com/sdsd/dsd/as.txt FALSE



    // Main
    package main
    import (
    "log"
    "fmt"
    "time"
    //"regexp"
    "net/http"
    "github.com/gocolly/colly"
    )

    var Target string
    var Domain string


    // main crawls outward from https://www.twitter.com/, following every <a>
    // link but only requesting root URLs (a bare domain or subdomain with no
    // path), and prints the Server banner of each URL it requests.
    func main() {
    	//r := regexp.MustCompile("(https:" + Domain + ".com)$")

    	c := colly.NewCollector(
    	//colly.URLFilters(r),
    	)

    	c.OnError(func(r *colly.Response, err error) {
    		fmt.Println(r.Request.URL, "Is Not Reachable", r.StatusCode)
    	})

    	// Find and visit all links.
    	c.OnHTML("a", func(e *colly.HTMLElement) {
    		// Visit can return an error for links the collector declines to
    		// fetch (for example ones it has already visited); that is expected
    		// while crawling, so the result is deliberately discarded.
    		_ = e.Request.Visit(e.Attr("href"))
    	})

    	c.OnRequest(func(r *colly.Request) {
    		// Only root URLs are wanted: cancel anything with a path after the host
    		// before any network traffic is generated for it.
    		if p := r.URL.Path; p != "" && p != "/" {
    			r.Abort()
    			return
    		}
    		// Use a distinct local name so the package-level Target and Domain
    		// variables are not shadowed.
    		banner := BannerGrab(r.URL.String())
    		fmt.Println(banner)
    		fmt.Println("Dropping By.. ", r.URL)
    		// Crude politeness delay between requests.
    		time.Sleep(time.Second)
    	})

    	if err := c.Visit("https://www.twitter.com/"); err != nil {
    		log.Fatalln(err)
    	}
    }

    //CheckDB if not listed else add
    //RiskDB
    //Email


    // BannerGrab sends a GET request to s with the "Authac/0.1" User-Agent and
    // returns the value of the response's Server header. It returns "" (after
    // logging the cause) when the request cannot be built or sent, and also
    // when the response carries no Server header.
    func BannerGrab(s string) string {
    	// Bound the request so a stalled host cannot hang the caller.
    	client := &http.Client{Timeout: 10 * time.Second}
    	req, err := http.NewRequest(http.MethodGet, s, nil)
    	if err != nil {
    		log.Printf("banner grab %q: %v", s, err)
    		return ""
    	}
    	req.Header.Set("User-Agent", "Authac/0.1")
    	resp, err := client.Do(req)
    	if err != nil {
    		// resp is nil on error; reading its headers would panic.
    		log.Printf("banner grab %q: %v", s, err)
    		return ""
    	}
    	defer resp.Body.Close()
    	return resp.Header.Get("Server")
    }











share|improve this question
















I am trying to grab ALL href links using the GoColly framework, but
only allow URLs of ANY domain that are root URLs or subdomains (NO
paths). I've commented out my regexp. The file extension doesn't
matter; I just don't want anything after a "/". I've tried using the
variable "Domain" and concatenation to build a "varied" regexp. I'm not
even sure if that's a thing.




  1. Twitter.Com - TRUE

  2. hello.authac.com - TRUE

  3. whole.facebook.com -TRUE

  4. twiiter.com/dd -FALSE

  5. hello.authac.com/sd.html FALSE


  6. whole.facebook.com/sdsd/dsd/as.txt FALSE



    // Main
    package main
    import (
    "log"
    "fmt"
    "time"
    //"regexp"
    "net/http"
    "github.com/gocolly/colly"
    )

    var Target string
    var Domain string


    // main crawls outward from https://www.twitter.com/, following every <a>
    // link but only requesting root URLs (a bare domain or subdomain with no
    // path), and prints the Server banner of each URL it requests.
    func main() {
    	//r := regexp.MustCompile("(https:" + Domain + ".com)$")

    	c := colly.NewCollector(
    	//colly.URLFilters(r),
    	)

    	c.OnError(func(r *colly.Response, err error) {
    		fmt.Println(r.Request.URL, "Is Not Reachable", r.StatusCode)
    	})

    	// Find and visit all links.
    	c.OnHTML("a", func(e *colly.HTMLElement) {
    		// Visit can return an error for links the collector declines to
    		// fetch (for example ones it has already visited); that is expected
    		// while crawling, so the result is deliberately discarded.
    		_ = e.Request.Visit(e.Attr("href"))
    	})

    	c.OnRequest(func(r *colly.Request) {
    		// Only root URLs are wanted: cancel anything with a path after the host
    		// before any network traffic is generated for it.
    		if p := r.URL.Path; p != "" && p != "/" {
    			r.Abort()
    			return
    		}
    		// Use a distinct local name so the package-level Target and Domain
    		// variables are not shadowed.
    		banner := BannerGrab(r.URL.String())
    		fmt.Println(banner)
    		fmt.Println("Dropping By.. ", r.URL)
    		// Crude politeness delay between requests.
    		time.Sleep(time.Second)
    	})

    	if err := c.Visit("https://www.twitter.com/"); err != nil {
    		log.Fatalln(err)
    	}
    }

    //CheckDB if not listed else add
    //RiskDB
    //Email


    // BannerGrab sends a GET request to s with the "Authac/0.1" User-Agent and
    // returns the value of the response's Server header. It returns "" (after
    // logging the cause) when the request cannot be built or sent, and also
    // when the response carries no Server header.
    func BannerGrab(s string) string {
    	// Bound the request so a stalled host cannot hang the caller.
    	client := &http.Client{Timeout: 10 * time.Second}
    	req, err := http.NewRequest(http.MethodGet, s, nil)
    	if err != nil {
    		log.Printf("banner grab %q: %v", s, err)
    		return ""
    	}
    	req.Header.Set("User-Agent", "Authac/0.1")
    	resp, err := client.Do(req)
    	if err != nil {
    		// resp is nil on error; reading its headers would panic.
    		log.Printf("banner grab %q: %v", s, err)
    		return ""
    	}
    	defer resp.Body.Close()
    	return resp.Header.Get("Server")
    }








regex go






share|improve this question















share|improve this question













share|improve this question




share|improve this question








edited Nov 21 '18 at 15:53









Flimzy

38.7k106597




38.7k106597










asked Nov 21 '18 at 14:01









AuthacAuthac

76




76













  • Hello, you just want true or false matches based on the presence or absence of a "/", or you want to match anything before the "/" anyway, to extract the root URL?

    – FMarazzi
    Nov 21 '18 at 14:08











  • I want to match anything before the / to only allow domain grabbing of ROOT URL's. Your question is a little funky with how Go-Colly work but in a essence. Root URL's and subdomains should be the only thing that I get when println comes up.

    – Authac
    Nov 21 '18 at 15:04



















  • Hello, you just want true or false matches based on the presence or absence of a "/", or you want to match anything before the "/" anyway, to extract the root URL?

    – FMarazzi
    Nov 21 '18 at 14:08











  • I want to match anything before the / to only allow domain grabbing of ROOT URL's. Your question is a little funky with how Go-Colly work but in a essence. Root URL's and subdomains should be the only thing that I get when println comes up.

    – Authac
    Nov 21 '18 at 15:04

















Hello, you just want true or false matches based on the presence or absence of a "/", or you want to match anything before the "/" anyway, to extract the root URL?

– FMarazzi
Nov 21 '18 at 14:08





Hello, you just want true or false matches based on the presence or absence of a "/", or you want to match anything before the "/" anyway, to extract the root URL?

– FMarazzi
Nov 21 '18 at 14:08













I want to match anything before the / to only allow domain grabbing of ROOT URL's. Your question is a little funky with how Go-Colly work but in a essence. Root URL's and subdomains should be the only thing that I get when println comes up.

– Authac
Nov 21 '18 at 15:04





I want to match anything before the / to only allow domain grabbing of ROOT URL's. Your question is a little funky with how Go-Colly work but in a essence. Root URL's and subdomains should be the only thing that I get when println comes up.

– Authac
Nov 21 '18 at 15:04












2 Answers
2






active

oldest

votes


















0














A possible regex is:



/ ^[^/\]+$ / gmi


This does not match whenever there is a "\" or "/" in the text.






share|improve this answer
























  • golang regex's don't support flags like gmi, and don't require delimiters like /. Also this regex is just not going to work because https://twitter.com should be accepted as per OP's question

    – Elias Van Ootegem
    Nov 21 '18 at 17:24











  • Agreed, but he specified "I just don't want anything after a "/"."

    – FMarazzi
    Nov 21 '18 at 17:42





















0














The regex below can match URLs consisting of only a domain and subdomain, with no path:




r := regexp.MustCompile("(https|http)://(.*?)")




To add, colly.URLFilters will only crawl URL's matching the pattern. It will also not be creating URL's matching the pattern from the full URL and then do crawling.






share|improve this answer
























  • The suggested example didn't work. Am I looking at colly.URLFilters and colly.Disallow backwards? It should only crawl root and subdomains with the above regex yet it stills traverse beyond that.

    – Authac
    Nov 21 '18 at 15:02











Your Answer






// Nested loader calls: "externalEditor" is requested first, then "snippets",
// and StackExchange.snippets.init() runs in the innermost callback.
// NOTE(review): StackExchange.ifUsing is defined outside this snippet; it
// presumably runs the callback only when the "editor" module is in use, and
// "code-snippets" looks like a label for that registration — confirm.
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

// Page-ready hook: sets up the tag renderer, then creates the answer editor
// once the "externalEditor" module (and "snippets", when enabled) has loaded.
StackExchange.ready(function() {
    var channelOptions = {
        tags: "".split(" "),
        id: "1"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
        // Have to fire editor after snippets, if snippets enabled
        if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
                createEditor();
            });
        }
        else {
            createEditor();
        }
    });

    // Configures the answer editor. The two HTML strings use \u003c / \u003e
    // escapes for "<" / ">" and backslash-escaped inner quotes so that they are
    // valid string literals.
    function createEditor() {
        StackExchange.prepareEditor({
            heartbeatType: 'answer',
            autoActivateHeartbeat: false,
            convertImagesToLinks: true,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: 10,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
                brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
                contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
                allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer",
            immediatelyShowMarkdownHelp: true
        });
    }
});














draft saved

draft discarded


















// Page-ready hook: calls StackExchange.openid.initPostLogin for the
// '.new-post-login' selector. The second argument is the URL-encoded address
// of this question's #new-answer anchor; the third is the literal
// 'question_page'.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53413766%2fregexp-with-differen-tunknown-string-matching%23new-answer', 'question_page');
}
);

Post as a guest















Required, but never shown

























2 Answers
2






active

oldest

votes








2 Answers
2






active

oldest

votes









active

oldest

votes






active

oldest

votes









0














A possible regex is:



/ ^[^/\]+$ / gmi


This does not match whenever there is a "\" or "/" in the text.






share|improve this answer
























  • golang regex's don't support flags like gmi, and don't require delimiters like /. Also this regex is just not going to work because https://twitter.com should be accepted as per OP's question

    – Elias Van Ootegem
    Nov 21 '18 at 17:24











  • Agreed, but he specified "I just don't want anything after a "/"."

    – FMarazzi
    Nov 21 '18 at 17:42


















0














A possible regex is:



/ ^[^/\]+$ / gmi


This does not match whenever there is a "\" or "/" in the text.






share|improve this answer
























  • golang regex's don't support flags like gmi, and don't require delimiters like /. Also this regex is just not going to work because https://twitter.com should be accepted as per OP's question

    – Elias Van Ootegem
    Nov 21 '18 at 17:24











  • Agreed, but he specified "I just don't want anything after a "/"."

    – FMarazzi
    Nov 21 '18 at 17:42
















0












0








0







A possible regex is:



/ ^[^/\]+$ / gmi


This does not match whenever there is a "\" or "/" in the text.






share|improve this answer













A possible regex is:



/ ^[^/\]+$ / gmi


This does not match whenever there is a "\" or "/" in the text.







share|improve this answer












share|improve this answer



share|improve this answer










answered Nov 21 '18 at 14:11









FMarazziFMarazzi

323213




323213













  • golang regex's don't support flags like gmi, and don't require delimiters like /. Also this regex is just not going to work because https://twitter.com should be accepted as per OP's question

    – Elias Van Ootegem
    Nov 21 '18 at 17:24











  • Agreed, but he specified "I just don't want anything after a "/"."

    – FMarazzi
    Nov 21 '18 at 17:42





















  • golang regex's don't support flags like gmi, and don't require delimiters like /. Also this regex is just not going to work because https://twitter.com should be accepted as per OP's question

    – Elias Van Ootegem
    Nov 21 '18 at 17:24











  • Agreed, but he specified "I just don't want anything after a "/"."

    – FMarazzi
    Nov 21 '18 at 17:42



















golang regex's don't support flags like gmi, and don't require delimiters like /. Also this regex is just not going to work because https://twitter.com should be accepted as per OP's question

– Elias Van Ootegem
Nov 21 '18 at 17:24





golang regex's don't support flags like gmi, and don't require delimiters like /. Also this regex is just not going to work because https://twitter.com should be accepted as per OP's question

– Elias Van Ootegem
Nov 21 '18 at 17:24













Agreed, but he specified "I just don't want anything after a "/"."

– FMarazzi
Nov 21 '18 at 17:42







Agreed, but he specified "I just don't want anything after a "/"."

– FMarazzi
Nov 21 '18 at 17:42















0














The regex below can match URLs consisting of only a domain and subdomain, with no path:




r := regexp.MustCompile("(https|http)://(.*?)")




To add, colly.URLFilters will only crawl URL's matching the pattern. It will also not be creating URL's matching the pattern from the full URL and then do crawling.






share|improve this answer
























  • The suggested example didn't work. Am I looking at colly.URLFilters and colly.Disallow backwards? It should only crawl root and subdomains with the above regex yet it stills traverse beyond that.

    – Authac
    Nov 21 '18 at 15:02
















0














The regex below can match URLs consisting of only a domain and subdomain, with no path:




r := regexp.MustCompile("(https|http)://(.*?)")




To add, colly.URLFilters will only crawl URL's matching the pattern. It will also not be creating URL's matching the pattern from the full URL and then do crawling.






share|improve this answer
























  • The suggested example didn't work. Am I looking at colly.URLFilters and colly.Disallow backwards? It should only crawl root and subdomains with the above regex yet it stills traverse beyond that.

    – Authac
    Nov 21 '18 at 15:02














0












0








0







The regex below can match URLs consisting of only a domain and subdomain, with no path:




r := regexp.MustCompile("(https|http)://(.*?)")




To add, colly.URLFilters will only crawl URL's matching the pattern. It will also not be creating URL's matching the pattern from the full URL and then do crawling.






share|improve this answer













The regex below can match URLs consisting of only a domain and subdomain, with no path:




r := regexp.MustCompile("(https|http)://(.*?)")




To add, colly.URLFilters will only crawl URL's matching the pattern. It will also not be creating URL's matching the pattern from the full URL and then do crawling.







share|improve this answer












share|improve this answer



share|improve this answer










answered Nov 21 '18 at 14:36









JeevanJeevan

1326




1326













  • The suggested example didn't work. Am I looking at colly.URLFilters and colly.Disallow backwards? It should only crawl root and subdomains with the above regex yet it stills traverse beyond that.

    – Authac
    Nov 21 '18 at 15:02



















  • The suggested example didn't work. Am I looking at colly.URLFilters and colly.Disallow backwards? It should only crawl root and subdomains with the above regex yet it stills traverse beyond that.

    – Authac
    Nov 21 '18 at 15:02

















The suggested example didn't work. Am I looking at colly.URLFilters and colly.Disallow backwards? It should only crawl root and subdomains with the above regex yet it stills traverse beyond that.

– Authac
Nov 21 '18 at 15:02





The suggested example didn't work. Am I looking at colly.URLFilters and colly.Disallow backwards? It should only crawl root and subdomains with the above regex yet it stills traverse beyond that.

– Authac
Nov 21 '18 at 15:02


















draft saved

draft discarded




















































Thanks for contributing an answer to Stack Overflow!


  • Please be sure to answer the question. Provide details and share your research!

But avoid



  • Asking for help, clarification, or responding to other answers.

  • Making statements based on opinion; back them up with references or personal experience.


To learn more, see our tips on writing great answers.




draft saved


draft discarded














// Page-ready hook: calls StackExchange.openid.initPostLogin for the
// '.new-post-login' selector. The second argument is the URL-encoded address
// of this question's #new-answer anchor; the third is the literal
// 'question_page'.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53413766%2fregexp-with-differen-tunknown-string-matching%23new-answer', 'question_page');
}
);

Post as a guest















Required, but never shown





















































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown

































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown







Popular posts from this blog

MongoDB - Not Authorized To Execute Command

How to fix TextFormField cause rebuild widget in Flutter

in spring boot 2.1 many test slices are not allowed anymore due to multiple @BootstrapWith