#!/usr/bin/perl -w
my $VERSION = "0.13";
# vim: set sw=4 ts=4 si et:
# Copyright: GPL
# Author: Guido Socher
#
use strict;
use vars qw($opt_h $opt_l);
use Getopt::Std;
#
sub help();
#------------------
# Main program: read the input "by tag" and print every tag whose
# (whitespace-normalized) content matches the given regexp.
#
# Command line: -h prints the help text, -l prefixes every match with
# "filename:linenumber: ".
getopts("lh") or die "ERROR: No such option. -h for help.\n";
help() if ($opt_h);
# A pattern is mandatory. Test for presence and length rather than for
# truth, so that the perfectly valid pattern "0" is not mistaken for a
# missing argument.
help() unless (defined $ARGV[0] && length $ARGV[0]);
my $pattern = shift;
# Compile the user supplied regexp once. The search is not case sensitive.
my $tag_re = qr/$pattern/i;
# Record separator, normally "\n" but in html "\n" means nothing. With '<'
# every record starts right behind a '<' and runs up to the next '<'.
$/ = '<';
my $current_line = 1; # line number reached at the end of the record just read
my $tag_line     = 1; # line number on which the current record (tag) starts
while (<>) {
    # Newlines carry no meaning inside a tag: turn each one into a space
    # and count them to keep track of line numbers.
    my $newlines = s/\r?\n/ /g;
    $current_line += $newlines if ($newlines);
    # take away the end of the tag and everything behind it:
    s/>.*//;
    # kill multiple space
    s/[ \t]+/ /g;
    if (/$tag_re/) {
        # Matches this tag. Print filename:linenumber: matched tag
        # The reported line is the one where the tag starts, not the one
        # reached after counting the newlines inside/behind the tag.
        print "${ARGV}:${tag_line}: " if ($opt_l);
        print "<$_>\n";
    }
    if (eof) {
        # That was the last record of this input file: restart the line
        # numbering so that the next file is counted from line 1 again.
        $current_line = 1;
        $tag_line     = 1;
    }
    else {
        # The next tag starts on the line we have reached so far.
        $tag_line = $current_line;
    }
}
#
# Print the usage text (including the version number) to STDOUT and
# terminate the program with exit status 0. This sub never returns.
# The empty prototype matches the forward declaration at the top of the file.
sub help(){
    # The example shows the same file name in the command and in its output,
    # and includes the matched tag, because that is what -l really prints.
    print <<"END_OF_HELP";
tr_tagcontentgrep -- grep for a xml/sgml/html tag
USAGE: tr_tagcontentgrep [-hl] regexp-pattern [file ...]
tr_tagcontentgrep opens all files provided on the command line
and searches for the given pattern in the tags. The search
is not case sensitive.
All space in the tags is reduced to max. one space. You can
search for "a href" even if the original tag had
multiple spaces between "a" and "href".
OPTIONS:
-h this help
-l list filename and line number
EXAMPLE:
tr_tagcontentgrep -l img file.html
would e.g print something like:
file.html:53: <img src="logo.gif" alt="logo">
file.html:257: <img src="photo.jpg" width="128" height="53">
tr_tagcontentgrep is part of the HTML::TagReader package
but is an example that you can also do 'reading by tag'
without HTML::TagReader. tr_tagcontentgrep uses plain perl
and sets the \$/ variable.
Working without HTML::TagReader causes however problems when working
with faulty html code where single '<'-characters appear
somewhere in the text.
version $VERSION
END_OF_HELP
    exit(0);
}
__END__
=head1 NAME
tr_tagcontentgrep -- grep for a xml/sgml/html tag
=head1 SYNOPSIS
tr_tagcontentgrep [-hl] regexp-pattern [file ...]
=head1 DESCRIPTION
tr_tagcontentgrep opens all files provided on the command line
and searches for the given pattern in the tags. The search
is not case sensitive.
tr_tagcontentgrep is part of the HTML::TagReader package
but is an example that you can also do 'reading by tag'
without HTML::TagReader. tr_tagcontentgrep uses plain perl
and sets the $/ variable.
Working without HTML::TagReader does, however, cause problems
with faulty html code where single '<' characters appear
somewhere in the text.
All space in the tags is reduced to max. one space. You can
search for "a href" even if the original tag had
multiple spaces between "a" and "href".
=head1 OPTIONS
-h this help
-l list filename and line number

=head1 EXAMPLE

    tr_tagcontentgrep -l img file.html

would e.g. print something like:

    file.html:53: <img src="logo.gif" alt="logo">
    file.html:257: <img src="photo.jpg" width="128" height="53">

=head1 AUTHOR
tr_tagcontentgrep is part of the HTML::TagReader package and was written by
Guido Socher [guido(at)linuxfocus.org]
=cut