# Download a webpage and extract links from it.

function get_links () {
page=`wget --header='Accept-Encoding: gzip' -q "$1" -O - 2>/dev/null`
stripped=`echo "$page" | tr "\t\r\n'" ' "' 2>/dev/null`

# page may be compressed
if [ $? -gt 0 ]
then
stripped=`echo "$page" | gunzip -q -c | tr "\t\r\n'" ' "' 2>/dev/null`
fi

echo "$stripped" | \
grep -i -o '<a[^>]\+href[ ]*=[ \t]*"[^"]\+"' | \
sed -e 's/^.*"\([^"]\+\)".*$/\1/g'

# for link in links if http not in link, append origin of $1
}
# Example run: list every link found on the given page.
get_links 'http://eloquentjavascript.net/'