regular expression to read the string between <title> and </title>
I would like to read the contents between <title> and </title> in an HTML string.
I think it should be in objective-c
@"<title([\\s\\S]*)</title>"
Below is the code I rewrote for regular expression support:
//source of NSStringCategory.h
#import <Foundation/Foundation.h>
#import <regex.h>
// Thin Objective-C wrapper around a compiled POSIX regular expression
// (regex_t from <regex.h>). An instance owns one compiled pattern and can be
// applied to any number of strings.
@interface NSStringCategory:NSObject
{
// Compiled pattern; filled in by regcomp() in -initWithPattern:options: and
// released with regfree() in -dealloc.
regex_t preg;
}
// Compiles `pattern` with regcomp(). `options` are regcomp() cflags;
// REG_EXTENDED is always OR-ed in by the implementation. Raises an
// NSException named "CSRegexException" if the pattern does not compile.
-(id)initWithPattern:(NSString *)pattern options:(int)options;
// Frees the compiled pattern with regfree().
-(void)dealloc;
// Returns YES when regexec() finds a match for the pattern in `string`.
-(BOOL)matchesString:(NSString *)string;
// Returns the part of `string` covered by the first overall match,
// or nil when the pattern does not match.
-(NSString *)matchedSubstringOfString:(NSString *)string;
// Returns an array with one entry per regexec() match slot (re_nsub+1 entries:
// the overall match first, then each parenthesized subexpression), or nil when
// the pattern does not match. Slots reported as unset (-1/-1) are represented
// by the +null placeholder string.
-(NSArray *)capturedSubstringsOfString:(NSString *)string;
// Convenience constructor: autoreleased instance compiled with `options`.
+(NSStringCategory *)regexWithPattern:(NSString *)pattern options:(int)options;
// Convenience constructor: autoreleased instance compiled with options 0.
+(NSStringCategory *)regexWithPattern:(NSString *)pattern;
// Returns the shared placeholder (an empty string created in +initialize)
// used for unset captures in -capturedSubstringsOfString:.
+(NSString *)null;
// Creates the shared placeholder string if it does not exist yet.
+(void)initialize;
@end
// Convenience methods on NSString. Each call builds a temporary
// NSStringCategory regex from `pattern` and applies it to the receiver.
// `options` are regcomp() cflags; the variants without `options` pass 0.
@interface NSString (NSStringCategory)
// YES when the receiver matches `pattern` (REG_NOSUB is added to `options`).
-(BOOL)matchedByPattern:(NSString *)pattern options:(int)options;
-(BOOL)matchedByPattern:(NSString *)pattern;
// The part of the receiver matched by `pattern`, or nil when there is no match.
-(NSString *)substringMatchedByPattern:(NSString *)pattern options:(int)options;
-(NSString *)substringMatchedByPattern:(NSString *)pattern;
// The overall match followed by the captured subexpressions of `pattern`,
// or nil when there is no match.
-(NSArray *)substringsCapturedByPattern:(NSString *)pattern options:(int)options;
-(NSArray *)substringsCapturedByPattern:(NSString *)pattern;
// A copy of the receiver in which each of the characters
// ^ . [ $ ( ) | * + ? { and \ is preceded by a backslash.
-(NSString *)escapedPattern;
@end
and the .m file:
#import "NSStringCategory.h"
static NSString *nullstring=nil;
// Wrapper around a compiled POSIX extended regular expression (regex_t).
// Written for manual reference counting (retain/release/autorelease).
@implementation NSStringCategory

// Compiles `pattern` with regcomp(); REG_EXTENDED is always added to `options`.
// Raises "CSRegexException" when the pattern is nil, has no UTF-8
// representation, or is rejected by regcomp().
-(id)initWithPattern:(NSString *)pattern options:(int)options
{
    if((self=[super init]))
    {
        // Messaging nil yields NULL here; regcomp() must never be handed NULL.
        const char *cpattern=[pattern UTF8String];
        if(!cpattern)
        {
            [NSException raise:@"CSRegexException"
            format:@"Could not compile regex: pattern is nil or has no UTF-8 representation"];
        }

        int err=regcomp(&preg,cpattern,options|REG_EXTENDED);
        if(err)
        {
            char errbuf[256];
            regerror(err,&preg,errbuf,sizeof(errbuf));
            // The instance is deliberately not released before raising:
            // -dealloc would call regfree() on a pattern that never compiled.
            [NSException raise:@"CSRegexException"
            format:@"Could not compile regex \"%@\": %s",pattern,errbuf];
        }
    }
    return self;
}

// Releases the compiled pattern.
-(void)dealloc
{
    regfree(&preg);
    [super dealloc];
}

// Returns YES when the pattern matches somewhere in `string`.
// A nil string never matches.
-(BOOL)matchesString:(NSString *)string
{
    const char *cstr=[string UTF8String];
    if(!cstr) return NO;
    return regexec(&preg,cstr,0,NULL,0)==0;
}

// Returns the substring covered by the overall match, or nil when there is
// no match, `string` is nil, or no match offsets are available.
-(NSString *)matchedSubstringOfString:(NSString *)string
{
    const char *cstr=[string UTF8String];
    if(!cstr) return nil;

    // Pre-set to "unset": a regex compiled with REG_NOSUB reports success
    // without writing any offsets, which would otherwise leave garbage here.
    regmatch_t match;
    match.rm_so=-1;
    match.rm_eo=-1;

    if(regexec(&preg,cstr,1,&match,0)!=0) return nil;
    if(match.rm_so<0||match.rm_eo<match.rm_so) return nil;

    return [[[NSString alloc] initWithBytes:cstr+match.rm_so
    length:match.rm_eo-match.rm_so encoding:NSUTF8StringEncoding] autorelease];
}

// Returns the overall match followed by every parenthesized subexpression
// (re_nsub+1 entries), or nil when there is no match or `string` is nil.
// Captures that are unset, or whose bytes cannot be decoded as UTF-8, are
// represented by the +null placeholder so the array never receives nil.
-(NSArray *)capturedSubstringsOfString:(NSString *)string
{
    const char *cstr=[string UTF8String];
    if(!cstr) return nil;

    size_t num=preg.re_nsub+1;
    regmatch_t *matches=calloc(num,sizeof(regmatch_t));
    if(!matches) return nil;

    NSArray *result=nil;
    if(regexec(&preg,cstr,num,matches,0)==0)
    {
        NSMutableArray *array=[NSMutableArray arrayWithCapacity:num];
        for(size_t i=0;i<num;i++)
        {
            NSString *str=nil;
            if(matches[i].rm_so>=0&&matches[i].rm_eo>=matches[i].rm_so)
            {
                str=[[[NSString alloc] initWithBytes:cstr+matches[i].rm_so
                length:matches[i].rm_eo-matches[i].rm_so encoding:NSUTF8StringEncoding] autorelease];
            }
            // -addObject: throws on nil, which would also leak `matches`.
            if(!str) str=nullstring;
            [array addObject:str];
        }
        result=[NSArray arrayWithArray:array];
    }

    // Single exit point so the offset buffer is always freed.
    free(matches);
    return result;
}

// Autoreleased regex compiled from `pattern` with the given regcomp() flags.
+(NSStringCategory *)regexWithPattern:(NSString *)pattern options:(int)options
{
    return [[[NSStringCategory alloc] initWithPattern:pattern options:options] autorelease];
}

// Autoreleased regex compiled from `pattern` with no extra flags.
+(NSStringCategory *)regexWithPattern:(NSString *)pattern
{
    return [[[NSStringCategory alloc] initWithPattern:pattern options:0] autorelease];
}

// Shared placeholder string used for captures that did not participate.
+(NSString *)null { return nullstring; }

// Creates the placeholder once; the guard keeps repeated +initialize calls
// (e.g. for subclasses) from replacing it.
+(void)initialize
{
    if(!nullstring) nullstring=[[NSString alloc] initWithString:@""];
}

@end
// NSString conveniences: each method compiles `pattern` into a temporary
// NSStringCategory regex and runs it against the receiver.
@implementation NSString (NSStringCategory)

// YES when the receiver matches `pattern`. REG_NOSUB is added because only
// a yes/no answer is needed.
-(BOOL)matchedByPattern:(NSString *)pattern options:(int)options
{
    return [[NSStringCategory regexWithPattern:pattern options:options|REG_NOSUB] matchesString:self];
}

// Same as -matchedByPattern:options: with options 0.
-(BOOL)matchedByPattern:(NSString *)pattern
{
    return [self matchedByPattern:pattern options:0];
}

// The part of the receiver matched by `pattern`, or nil when nothing matches.
-(NSString *)substringMatchedByPattern:(NSString *)pattern options:(int)options
{
    return [[NSStringCategory regexWithPattern:pattern options:options] matchedSubstringOfString:self];
}

// Same as -substringMatchedByPattern:options: with options 0.
-(NSString *)substringMatchedByPattern:(NSString *)pattern
{
    return [self substringMatchedByPattern:pattern options:0];
}

// The overall match and captured subexpressions, or nil when nothing matches.
-(NSArray *)substringsCapturedByPattern:(NSString *)pattern options:(int)options
{
    return [[NSStringCategory regexWithPattern:pattern options:options] capturedSubstringsOfString:self];
}

// Same as -substringsCapturedByPattern:options: with options 0.
-(NSArray *)substringsCapturedByPattern:(NSString *)pattern
{
    return [self substringsCapturedByPattern:pattern options:0];
}

// Returns a copy of the receiver with a backslash placed in front of every
// regex metacharacter from the set ^ . [ $ ( ) | * + ? { and backslash.
-(NSString *)escapedPattern
{
    int count=[self length];
    NSMutableString *result=[NSMutableString stringWithCapacity:count];
    for(int pos=0;pos<count;pos++)
    {
        unichar ch=[self characterAtIndex:pos];
        switch(ch)
        {
            case '^': case '.': case '[': case '$':
            case '(': case ')': case '|': case '*':
            case '+': case '?': case '{': case '\\':
                [result appendFormat:@"\\%C",ch];
                break;
            default:
                [result appendFormat:@"%C",ch];
                break;
        }
    }
    return [NSString stringWithString:result];
}

@end
I use the code below to get the string between "<title>" and "</title>":
// NOTE(review): "\s" and "\S" are not recognized escape sequences in an
// Objective-C string literal (the pattern shown earlier doubles the
// backslashes), and POSIX regular expressions from regex.h do not define
// \s / \S in any case — confirm the intended pattern.
NSStringCategory *a=[[NSStringCategory alloc] initWithPattern:@"<title([\s\S]*)</title>" options:0];//
Unfortunately, [a matchedSubstringOfString:response] always returns nil.
I do not know whether the regular expression is wrong or there is some other reason.
Any comments are welcome.
Thanks
interdev
(Preliminary warning: you can't parse HTML correctly with Regex.)
You are using regex.h, which provides POSIX regular expressions (ERE in your case). They do not support all of the PCRE syntax, such as \s and \S (and [\s\S] is useless anyway — it matches anything).
Probably you should use
initWithPattern:@"<title[^>]*>([^<]*)</title>" options:REG_ICASE
<title[^>]*>\([^<]*\)</title>
should do the trick.
For this specific case, I might try instantiating a WebDocumentRepresentation object from the /System/Library/Frameworks/WebKit framework.
You could set the data source for the WebDocumentRepresentation object to the HTML page you were interested in, and then use the object's title method to return the title.
Here's the Mac OSX Reference Library document on the object.
精彩评论