From b6e3c49e63324eddd2e56fa7e918821e7a497d33 Mon Sep 17 00:00:00 2001 From: terminaldweller Date: Thu, 27 Jan 2022 10:44:16 +0330 Subject: updates --- bin/extractor | 86 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 45 insertions(+), 41 deletions(-) (limited to 'bin/extractor') diff --git a/bin/extractor b/bin/extractor index c6dc4e7..4b56d1b 100755 --- a/bin/extractor +++ b/bin/extractor @@ -1,9 +1,10 @@ #!/usr/bin/env lua ---by Egor Skriptunoff here:https://stackoverflow.com/questions/23590304/finding-a-url-in-a-string-lua-pattern +-- by Egor Skriptunoff here:https://stackoverflow.com/questions/23590304/finding-a-url-in-a-string-lua-pattern local extractor = {} function extractor.getURLS(text_with_URLs) - local domains = [[.ac.ad.ae.aero.af.ag.ai.al.am.an.ao.aq.ar.arpa.as.asia.at.au + local domains = + [[.ac.ad.ae.aero.af.ag.ai.al.am.an.ao.aq.ar.arpa.as.asia.at.au .aw.ax.az.ba.bb.bd.be.bf.bg.bh.bi.biz.bj.bm.bn.bo.br.bs.bt.bv.bw.by.bz.ca .cat.cc.cd.cf.cg.ch.ci.ck.cl.cm.cn.co.com.coop.cr.cs.cu.cv.cx.cy.cz.dd.de .dj.dk.dm.do.dz.ec.edu.ee.eg.eh.er.es.et.eu.fi.firm.fj.fk.fm.fo.fr.fx.ga @@ -17,50 +18,53 @@ function extractor.getURLS(text_with_URLs) .to.tp.tr.travel.tt.tv.tw.tz.ua.ug.uk.um.us.uy.va.vc.ve.vg.vi.vn.vu.web.wf .ws.xxx.ye.yt.yu.za.zm.zr.zw]] - local tlds = {} + local tlds = {} - for tld in domains:gmatch'%w+' do - tlds[tld] = true - end - local function max4(a,b,c,d) return math.max(a+0, b+0, c+0, d+0) end - local protocols = {[''] = 0, ['http://'] = 0, ['https://'] = 0, ['ftp://'] = 0} - local finished = {} + for tld in domains:gmatch '%w+' do tlds[tld] = true end + local function max4(a, b, c, d) + return math.max(a + 0, b + 0, c + 0, d + 0) + end + local protocols = { + [''] = 0, + ['http://'] = 0, + ['https://'] = 0, + ['ftp://'] = 0 + } + local finished = {} - for pos_start, url, prot, subd, tld, colon, port, slash, path in - text_with_URLs:gmatch'()(([%w_.~!*:@&+$/?%%#-]-)(%w[-.%w]*%.)(%w+)(:?)(%d*)(/?)([%w_.~!*:@&+$/?%%#=-]*))' - do - if protocols[prot:lower()] == (1 - #slash) * #path and not subd:find'%W%W' - and (colon == '' or port ~= '' and port + 0 < 65536) - and (tlds[tld:lower()] or tld:find'^%d+$' and subd:find'^%d+%.%d+%.%d+%.$' - and max4(tld, subd:match'^(%d+)%.(%d+)%.(%d+)%.$') < 256) - then - finished[pos_start] = true - print(pos_start, url) - end - end + for pos_start, url, prot, subd, tld, colon, port, slash, path in + text_with_URLs:gmatch '()(([%w_.~!*:@&+$/?%%#-]-)(%w[-.%w]*%.)(%w+)(:?)(%d*)(/?)([%w_.~!*:@&+$/?%%#=-]*))' do + if protocols[prot:lower()] == (1 - #slash) * #path and + not subd:find '%W%W' and + (colon == '' or port ~= '' and port + 0 < 65536) and + (tlds[tld:lower()] or tld:find '^%d+$' and + subd:find '^%d+%.%d+%.%d+%.$' and + max4(tld, subd:match '^(%d+)%.(%d+)%.(%d+)%.$') < 256) then + finished[pos_start] = true + print(pos_start, url) + end + end - for pos_start, url, prot, dom, colon, port, slash, path in - text_with_URLs:gmatch'()((%f[%w]%a+://)(%w[-.%w]*)(:?)(%d*)(/?)([%w_.~!*:@&+$/?%%#=-]*))' - do - if not finished[pos_start] and not (dom..'.'):find'%W%W' - and protocols[prot:lower()] == (1 - #slash) * #path - and (colon == '' or port ~= '' and port + 0 < 65536) - then - print(pos_start, url) - end - end + for pos_start, url, prot, dom, colon, port, slash, path in + text_with_URLs:gmatch '()((%f[%w]%a+://)(%w[-.%w]*)(:?)(%d*)(/?)([%w_.~!*:@&+$/?%%#=-]*))' do + if not finished[pos_start] and not (dom .. '.'):find '%W%W' and + protocols[prot:lower()] == (1 - #slash) * #path and + (colon == '' or port ~= '' and port + 0 < 65536) then + print(pos_start, url) + end + end end -function main() - local input = io.read("*a") - for k,v in pairs(arg) do - if v == "--url" then - extractor.getURLS(input) - else - print("invalid args...") - os.exit(1) - end - end +local function main() + local input = io.read("*a") + for _, v in pairs(arg) do + if v == "--url" then + extractor.getURLS(input) + else + print("invalid args...") + os.exit(1) + end + end end main() -- cgit v1.2.3