How can I extract the domain from a URL?
I'm currently making a few changes in the rTorrent source. I have the following code:
// Placeholder command: returns the text "URL: <url>." built from the string
// argument. (The question asks how to turn this into real domain extraction.)
//
// rawArgs - command argument; its as_string() value is used as the URL.
//           NOTE(review): assumes as_string() yields a std::string (the
//           original called .c_str() on it) -- confirm against torrent::Object.
// Returns the formatted text, converted to torrent::Object on return.
torrent::Object
apply_to_domain(const torrent::Object& rawArgs) {
  // Build the result as a std::string of whatever length is needed. The
  // previous fixed 50-byte snprintf buffer silently truncated any URL longer
  // than about 43 characters.
  return "URL: " + rawArgs.as_string() + ".";
}
I need to extract the domain from url. There's a regex.h included in the source but I'm not sure if I can use that or if I need to use a different regex library.
Link to regex.h
The only thing that "regex" implementation handles is the wildcard character, `*`. (BTW, I'm just assuming it's a wildcard, since it's the only character that's recognised and the comments seem to hint as much, but I haven't actually verified it.)
Use a proper regex library like Boost.Regex.
// This is a hacked up whole string pattern matching. Replace with
// TR1's regex when that becomes widely available. It is intended for
// small strings.
That's not going to work for extracting the domain. Use Boost or VSCRT TR1 instead.
See *get_active_tracker_domain* in command_pyroscope.cc
On Windows:
#include <winsock2.h>
#include <windows.h>
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
#include <cctype>
#include <locale>
#pragma comment(lib,"ws2_32.lib")
using namespace std;
// Accumulates the text received by get_Website(); extract_URL() lower-cases
// it in place and then scans it for "http:" occurrences.
string website_HTML;
// Default-constructed locale passed to tolower() in extract_URL().
locale local;
//***************************
// Fetches "/" from the given host over HTTP and appends the response text to
// website_HTML. Defined below main().
void get_Website(char *url );
// Scans website_HTML for "http:" links and prints them. Defined below main().
void extract_URL();
//***************************
// Entry point: downloads the front page of a fixed host, then prints every
// "http:" link found in it.
int main ()
{
    // Use a writable array rather than `char *url = "..."`: binding a string
    // literal to a non-const char* is ill-formed since C++11 and would let
    // the callee write to read-only storage. get_Website() takes char*.
    char url[] = "www.bbc.com";
    get_Website(url);
    extract_URL();
    return 0;
}
//***************************
// Fetches "/" from host `url` over plain HTTP (TCP port 80) and appends the
// response text to the global website_HTML.
//
// url - NUL-terminated host name (no scheme, no path), e.g. "www.bbc.com".
//
// Everything the server sends (status line and headers included) is appended,
// except bytes that are neither CR, LF, nor >= 32 as a `char`; those are
// dropped. NOTE(review): where `char` is signed this also drops bytes >= 0x80,
// as the original comparison did.
//
// On any setup failure a message is printed, Winsock resources acquired so
// far are released, and the process exits with status 0 (the original status
// is kept).
void get_Website(char *url )
{
    // Build the request in a std::string: the host name length is unknown,
    // and the old fixed 256-byte strcpy/strcat buffer could overflow.
    std::string request = "GET / HTTP/1.1\r\nHost: ";
    request += url;
    request += "\r\nConnection: close\r\n\r\n";

    WSADATA wsaData;
    if (WSAStartup(MAKEWORD(2,2), &wsaData) != 0)
    {
        std::cout << "WSAStartup failed.\n";
        exit(0);
    }

    SOCKET Socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (Socket == INVALID_SOCKET)
    {
        std::cout << "Could not create socket\n";
        WSACleanup();
        exit(0);
    }

    // gethostbyname() returns NULL when the lookup fails; the original
    // dereferenced the result unconditionally.
    struct hostent *host = gethostbyname(url);
    if (!host || !host->h_addr_list[0])
    {
        std::cout << "Could not resolve host\n";
        closesocket(Socket);
        WSACleanup();
        exit(0);
    }

    SOCKADDR_IN SockAddr = {};   // zero-fill, including sin_zero padding
    SockAddr.sin_port = htons(80);
    SockAddr.sin_family = AF_INET;
    SockAddr.sin_addr.s_addr = *reinterpret_cast<unsigned long*>(host->h_addr_list[0]);

    std::cout << "Connecting to ["<< url<<"]...\n";
    if (connect(Socket, reinterpret_cast<SOCKADDR*>(&SockAddr), sizeof(SockAddr)) != 0)
    {
        std::cout << "Could not connect\n";
        closesocket(Socket);
        WSACleanup();
        exit(0);
    }
    std::cout << "Connected. (success!)\n";
    std::cout << std::flush;

    if (send(Socket, request.c_str(), static_cast<int>(request.size()), 0) == SOCKET_ERROR)
    {
        std::cout << "Could not send request\n";
        closesocket(Socket);
        WSACleanup();
        exit(0);
    }

    char buffer[10000];
    int nDataLength;
    int totalBytes = 0;
    while ((nDataLength = recv(Socket, buffer, static_cast<int>(sizeof(buffer)), 0)) > 0)
    {
        totalBytes += nDataLength;
        // Only the first nDataLength bytes are valid for this call. The
        // original kept a running index across calls and never checked it
        // against nDataLength, so it read stale or out-of-bounds bytes and
        // stopped at the first filtered byte.
        for (int i = 0; i < nDataLength; ++i)
        {
            const char c = buffer[i];
            if (c >= 32 || c == '\n' || c == '\r')
            {
                website_HTML += c;
            }
        }
    }
    std::cout<<"\n"<<totalBytes<<" bytes downloaded \n\n";

    closesocket(Socket);
    WSACleanup();
}
// Lower-cases the global website_HTML in place, then collects and prints
// every "http:" link found in it.
//
// A link starts at each occurrence of "http:" and runs up to (not including)
// the next double quote, or for at most 5 + 256 characters if no quote occurs
// in that window. A spinner is written to stdout for every scanned position,
// then each link is printed on its own line followed by the total count.
void extract_URL()
{
    for (size_t i=0; i<website_HTML.length(); ++i) website_HTML[i]= tolower(website_HTML[i],local);

    const std::string to_find = "http:";
    std::vector<std::string> extracted_website_URL;

    std::cout << "\nExtracting url.. ";
    // `pos + needle <= length` cannot underflow when the page is shorter than
    // the needle (the original `length() - to_find.length()` wrapped around),
    // and it also tests the final possible start position, which the original
    // `<` bound skipped.
    for (std::string::size_type pos = 0; pos + to_find.length() <= website_HTML.length(); ++pos)
    {
        if (website_HTML.compare(pos, to_find.length(), to_find) == 0)
        {
            const std::string window = website_HTML.substr(pos, to_find.length()+256);
            // Keep everything before the first double quote; find() returning
            // npos keeps the whole window.
            extracted_website_URL.push_back(window.substr(0, window.find('"')));
        }
        // Same spinner bytes as before, written with a single flush.
        std::cout << "\b\\" "\b|" "\b/" "\b-" << std::flush;
    }

    // The original reused the first loop's `j` here, outside its scope, which
    // does not compile in standard C++.
    for (std::vector<std::string>::size_type k = 0; k < extracted_website_URL.size(); ++k)
    {
        std::cout<<extracted_website_URL[k] <<" \n";
    }
    std::cout<<"\n"<<extracted_website_URL.size()<<" URL's extracted ";
    std::cout<<"\n\n";
}
Something basic that may do the job:
#include <regex>
// Extracts the host component from a URL of the form
//   scheme://[userinfo@]host[:port][/path][?query][#fragment]
//
// url - the URL to inspect.
// Returns the host ("example.com", or "[::1]" for a bracketed IPv6 literal).
// If `url` has no "scheme://" prefix or the host is empty, `url` is returned
// unchanged, as the original did for input that failed to match.
//
// Parsed by hand instead of with the previous regex "^.*://([^/?:]+)/?.*$":
// its greedy ".*" anchored on the LAST "://", so a URL embedded in the query
// (http://a.com/?u=http://b.com/) yielded "b.com". It also returned userinfo
// instead of the host, kept "#fragment" text, and cut IPv6 literals at the
// first ':'.
std::string getHostFromUrl(const std::string & url) {
    const std::string::size_type npos = std::string::npos;

    // The first "://" must end the scheme, so no '/', '?' or '#' may come
    // before it; otherwise the "://" belongs to a path or query.
    const std::string::size_type schemeEnd = url.find("://");
    if (schemeEnd == npos || url.find_first_of("/?#") != schemeEnd + 1) {
        return url;
    }

    // The authority runs from after "://" to the first '/', '?' or '#'.
    const std::string::size_type authBegin = schemeEnd + 3;
    const std::string::size_type authEnd = url.find_first_of("/?#", authBegin);
    std::string authority =
        url.substr(authBegin, authEnd == npos ? npos : authEnd - authBegin);

    // Drop "user:password@" if present.
    const std::string::size_type at = authority.rfind('@');
    if (at != npos) {
        authority.erase(0, at + 1);
    }

    // A bracketed IPv6 literal contains ':' itself, so it ends at ']';
    // any other host ends at the ':' that introduces the port.
    std::string::size_type hostLen;
    if (!authority.empty() && authority[0] == '[') {
        const std::string::size_type close = authority.find(']');
        hostLen = (close == npos) ? npos : close + 1;
    } else {
        hostLen = authority.find(':');
    }

    const std::string host = authority.substr(0, hostLen);
    return host.empty() ? url : host;
}
Try C++11 Regex:
#include <iostream>
#include <string>
#include <regex>
// Demo: prints every http/https URL prefix found in a sample sentence, one
// per line.
int main()
{
    // Sample text; adjacent literals concatenate to the same single string.
    const std::string text(
        "The link of this question: https://stackoverflow.com/questions/3073753/how-can-i-extract-the-domain-from-a-url "
        "Other urls are https://www.google.com, facebook.com. https://my_site.online.com:1234");

    // Scheme, optional "www.", then 1-256 characters from the allowed set.
    const std::regex urlPattern("https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}");

    // Walk the successive non-overlapping matches from left to right.
    for (std::sregex_iterator hit(text.begin(), text.end(), urlPattern), done; hit != done; ++hit)
    {
        std::cout << hit->str() << '\n';
    }
}
In Qt, you can use QUrl:
// Parse the address with QUrl and log only its host component.
const QString address("https://somedomain.com/index/of/somepage/blah/blah");
const QUrl parsed(address);
qDebug() << "qu.host " << parsed.host();
It will give you: somedomain.com
精彩评论