Hacker News new | past | comments | ask | show | jobs | submit login
Extract URLs Relative and/or Absolute yy044
1 point by textmode on Dec 21, 2023 | hide | past | favorite | 1 comment

       /*
            experimental
            usage: [host=example.com] yy044 [ar]

            Reads HTML on the scanner input and prints one URL per line.
              no argument  print both relative (href) and absolute URLs
              a            print absolute URLs only
              r            print relative (href) URLs only
            When the host (or Host) environment variable is set, its value
            is prefixed with https:// to relative URLs.
       */
       /*
            Hand-written prototypes.
            NOTE(review): presumably these keep a strict -std=c89 -Wall
            build free of implicit-declaration warnings - confirm.
            setenv is not called anywhere in the visible source.
       */
    int fileno (FILE *);
       /*
            Scanner-wide state:
              x, y                scratch indices used by the rule actions
              absolute, relative  output-mode flags, set up in main
              omit, proto         per-match flags used by the quoted-href rule
       */
    int x,y,absolute,relative,omit,proto;
    int setenv (const char *, const char *, int);
    int fprintf(FILE *__restrict, const char *__restrict, ...);
    size_t strlen (const char *);
       /*
            Named definitions. In the visible rules they are referenced only
            inside bracket expressions.
            NOTE(review): flex does not expand definition names inside a
            character class, so there the names contribute their literal
            letters rather than these patterns - verify against the flex
            manual.
       */
   http https://|http://
   js javascript:
       /*
            nounput/noinput: do not generate the unused unput and input
            helpers; noyywrap: treat end of input as end of scanning.
       */
   %option nounput noinput noyywrap 
   %%
   href=[^{http}{js}\"'][^\"'\40>]+ puts("yy044");
       /*
            The rule above matches an href value that does not start with a
            quote. Its action only prints the fixed text yy044 and a newline
            on stdout; the matched URL itself is not emitted.
            NOTE(review): looks like a placeholder action - confirm intent.
       */
   
   href=[\"'][^{http}{js}][^\"'\40>]+ {
    /*
     * Quoted href attribute. yytext holds href= plus the opening quote
     * (6 bytes) followed by the URL; the closing quote is not part of
     * the match.
     *
     * Nothing is printed when the URL starts with a dot or when relative
     * output is switched off. Otherwise the URL is written to yyout on
     * its own line, prefixed as follows:
     *   - protocol-relative URL (two leading slashes): https:
     *   - any other URL, when the host or Host environment variable is
     *     set: https:// followed by that value (host wins over Host)
     *   - no prefix when neither variable is set
     *
     * NOTE(review): the bracket expression after the quote lists the
     * letters of the http and js definition names literally (flex does
     * not expand definitions inside a character class), so URLs starting
     * with h, t, p, j or s are not matched by this rule - confirm intent.
     */
    const char *base;

    omit = (yytext[6] == '.');
    /* Both leading bytes must be slashes; a lone slash at index 7 is just a path separator. */
    proto = (yytext[6] == '/' && yytext[7] == '/');

    if (!omit && relative) {
        /* Drop the 6-byte attribute prefix and any NUL bytes, compacting in place. */
        y = 0;
        for (x = 6; x < yyleng; x++) {
            if (yytext[x]) {
                yytext[y++] = yytext[x];
            }
        }

        if (proto) {
            fwrite("https:", 1, 6, yyout);
        } else {
            base = getenv("host");
            if (!base) {
                base = getenv("Host");
            }
            if (base) {
                fwrite("https://", 1, 8, yyout);
                fwrite(base, 1, strlen(base), yyout);
            }
        }

        /* Write y bytes: anything past y is stale data left over from compaction. */
        fwrite(yytext, 1, (size_t)y, yyout);
        putc('\n', yyout);
    }
    }
            /*
              javascript:void(0)
              #blah
            */
   "https://"[^\40<>\43\42\47|\r\n]+ {
    /*
     * Absolute https URL anywhere in the input. The match stops at a
     * space, angle bracket, hash, quote, vertical bar or line break.
     * When absolute output is enabled the URL is written to yyout on its
     * own line, with any NUL bytes removed.
     */
    if (absolute) {
        /* Squeeze out NUL bytes in place; y ends up as the cleaned length. */
        y = 0;
        for (x = 0; x < yyleng; x++) {
            if (yytext[x]) {
                yytext[y++] = yytext[x];
            }
        }
        /* Print only when something besides the 8-byte scheme remains. */
        if (y > 8) {
            fwrite(yytext, 1, (size_t)y, yyout);
            putc('\n', yyout);
        }
    }
    }
   "http://"[^\40<>\43\42\47|\r\n]+ {
    /*
     * Absolute http URL anywhere in the input. The match stops at a
     * space, angle bracket, hash, quote, vertical bar or line break.
     * When absolute output is enabled the URL is written to yyout on its
     * own line, with any NUL bytes removed.
     */
    if (absolute) {
        /* Squeeze out NUL bytes in place; y ends up as the cleaned length. */
        y = 0;
        for (x = 0; x < yyleng; x++) {
            if (yytext[x]) {
                yytext[y++] = yytext[x];
            }
        }
        /* Print only when something besides the 7-byte scheme remains. */
        if (y > 7) {
            fwrite(yytext, 1, (size_t)y, yyout);
            putc('\n', yyout);
        }
    }
    }
         /*
             href=https://x/http://
         */
   .|\n
       /*
            The rule above has no action: every byte that no earlier rule
            consumed is matched one at a time and dropped, so only URLs
            reach the output.
       */
   %%
          int main(int argc,char* argv[])
          {
              /*
               * Entry point. The optional first argument selects the output
               * mode by its first letter: a = absolute URLs only,
               * r = relative URLs only. Anything else leaves both enabled.
               */

              /* Both flags are zero-initialised file-scope ints; turn both on. */
              absolute += 1;
              relative += 1;

              if (argc != 0 && argv[1]) {
                  switch (argv[1][0]) {
                  case 'a':
                      relative -= 1;
                      break;
                  case 'r':
                      absolute -= 1;
                      break;
                  default:
                      break;
                  }
              }

              /* Run the scanner over the whole input, then leave with success. */
              yylex();
              exit(0);
          }



Normally I use yy030, but I have been experimenting with this instead.

It seems to be slightly faster and smaller than similar programs from html-xml-utils.

https://www.w3.org/Tools/HTML-XML-utils/man1/

Compile:

   links -no-connect -dump https://news.ycombinator.com/item?id=38727772 \
   |sed '1,4d;77,$d;s/[ ]\{6\}//' \
   |flex -8Cem;cc -O3 -std=c89 -W -Wall -pipe lex.yy.c -static -o yy044
   strip -s yy044
Example usage:

      # NB. not a real cookie
      curl -H "cookie=user=santa&K7RGzmUtAoKv9OIRMfQ9bfwYpiDEuypp" -siA "" \
      https://news.ycombinator.com \
      |host=news.ycombinator.com/ yy044 r \
      |sed -n 's/&amp;/\&/g;/vote/p'




Consider applying for YC's Fall 2025 batch! Applications are open till Aug 4

Guidelines | FAQ | Lists | API | Security | Legal | Apply to YC | Contact

Search: