Connection Google

Hi,

I'm writing a program that fetches a keyword search results page from Google. The program below prints the HTML source of the page (e.g. for the keyword "water"). The problem is that I want the output to contain only the result links, one per line.

How can I do that? With regular expressions, or is there a better method?

Your help will be very useful for me. :-)

Thanks in advance

mp

import java.io.*;
import java.net.*;

/**
 * Fetches a Google search result page for the keyword "water" and prints the
 * returned HTML to standard output, one line at a time.
 */
public class ReadGoogle {

    /** Search request issued by {@link #main(String[])}. */
    private static final String SEARCH_URL = "http://www.google.pt/search?hl=pt-PT&q=water&meta=";

    /**
     * User-Agent sent with the request. Google search requires this header and
     * answers with HTTP 403 when the request does not carry an acceptable one.
     */
    private static final String USER_AGENT = "Mozilla/5.0";

    /**
     * Runs the query and echoes the response body.
     *
     * @param args ignored
     * @throws Exception if the URL cannot be opened or the response cannot be read
     */
    public static void main(String[] args) throws Exception {
        // Go through URLConnection instead of URL.openStream() so the header can be set.
        URLConnection connection = new URL(SEARCH_URL).openConnection();
        connection.setRequestProperty("User-Agent", USER_AGENT);

        BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
        try {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                System.out.println(inputLine);
            }
        } finally {
            // Release the connection even when reading or printing fails.
            in.close();
        }
    }
}

[863 byte] By [mpea] at [2007-11-26 18:49:31]
# 1

hi there

First, I'm not sure I fully understand what you mean (sorry for my bad English). Anyway, if the keyword "water" is what you want, you could first send the query request and then, in the response returned by the server, search for a particular line or words that uniquely identify the keyword "water", e.g. the Google search input text box. Unfortunately you can't do this with the URL class, as Google will always return a 403 error; to do this you need to implement your own socket program to make the HTTP request. This is exactly what I'm doing, and I'm now stuck on the session problem. Anyway, if you want, here is an example; just copy and run it and you will get your "water" :-)

import java.net.*;

import java.io.*;

import java.util.Properties;

import java.util.Enumeration;

import java.util.regex.Pattern;

import java.util.regex.Matcher;

/**
 * Minimal HTTP/1.0 client built directly on a {@link Socket}.
 *
 * <p>{@link #GET(String)}, {@link #HEAD(String)} and {@link #POST(String, String)} each open a
 * new connection, send one request, read the complete response into memory and close the
 * connection. The status line, headers and body are then available through the accessors.
 * Only {@code http} URLs are accepted. I/O failures inside the request methods are reported
 * with {@code printStackTrace()} and are not rethrown.
 */
public class Http {

    /** Matches the search input box of the Google result page fetched by {@link #main(String[])}. */
    private static final Pattern SEARCH_BOX =
            Pattern.compile("<input type=text name=q size=41 maxlength=2048 value=[^\n]* title=");

    /** Socket of the request in progress; {@code null} when no connection is open. */
    protected Socket client;

    /** Buffered stream used to write the request to {@link #client}. */
    protected BufferedOutputStream sender;

    /** Buffered stream used to read the response from {@link #client}. */
    protected BufferedInputStream receiver;

    /** Body of the last response; {@code null} when the response carried no body. */
    protected ByteArrayInputStream byteStream;

    /** URL accepted by the last successful {@link #checkHTTP(String)} call. */
    protected URL target;

    private int responseCode = -1;

    private String responseMessage = "";

    private String serverVersion = "";

    /** Response headers of the last response, keyed by header name exactly as received. */
    private final Properties header = new Properties();

    /** Creates a client that has not issued any request yet. */
    public Http() {
    }

    /**
     * Creates a client and immediately issues a GET request.
     *
     * @param url the {@code http} URL to fetch
     */
    public Http(String url) {
        GET(url);
    }

    /**
     * Issues a GET request and stores the response in this object.
     *
     * @param url the {@code http} URL to fetch
     */
    public void GET(String url) {
        request("GET", url, "", "");
    }

    /**
     * Issues a HEAD request and stores the response in this object.
     *
     * @param url the {@code http} URL to query
     */
    public void HEAD(String url) {
        request("HEAD", url, "", "");
    }

    /**
     * Issues a form-encoded POST request and stores the response in this object.
     *
     * @param url the {@code http} URL to post to
     * @param content the already URL-encoded form data; {@code null} is sent as an empty body
     */
    public void POST(String url, String content) {
        String body = content == null ? "" : content;
        // Content-length counts bytes on the wire, not characters.
        String entityHeaders = "Content-type: application/x-www-form-urlencoded\r\n"
                + "Content-length: " + body.getBytes().length + "\r\n";
        request("POST", url, entityHeaders, body + "\r\n");
    }

    /**
     * Shared implementation of the request methods: validates the URL, connects, sends the
     * request and reads the response.
     *
     * @param method the HTTP method name
     * @param url the URL to request
     * @param extraHeaders complete header lines (each ending in CRLF) appended to the base headers
     * @param body text sent after the blank line that ends the headers
     */
    private void request(String method, String url, String extraHeaders, String body) {
        try {
            checkHTTP(url);
            openServer(target.getHost(), target.getPort());
            sendMessage(method + " " + getURLFormat(target) + " HTTP/1.0\r\n"
                    + getBaseHeads() + extraHeaders + "\r\n" + body);
            receiveMessage();
        } catch (IOException e) {
            // ProtocolException, UnknownHostException and FileNotFoundException are IOExceptions.
            e.printStackTrace();
            // Do not leave the socket open after a failed exchange.
            closeQuietly();
        }
    }

    /**
     * Parses {@code url} and stores it in {@link #target} when it uses the {@code http} protocol.
     *
     * @param url the URL text to validate
     * @throws ProtocolException if the text is not a valid URL or its protocol is not {@code http}
     */
    protected void checkHTTP(String url) throws ProtocolException {
        URL parsed;
        try {
            parsed = new URL(url);
        } catch (MalformedURLException m) {
            // Message reads "protocol format error".
            ProtocolException failure = new ProtocolException("\u534F\u8BAE\u683C\u5F0F\u9519\u8BEF");
            failure.initCause(m);
            throw failure;
        }
        if (!"http".equalsIgnoreCase(parsed.getProtocol())) {
            // Message reads "this is not the HTTP protocol".
            throw new ProtocolException("\u8FD9\u4E0D\u662FHTTP\u534F\u8BAE");
        }
        this.target = parsed;
    }

    /**
     * Resets the stored response state, closes any previous connection and connects to the server.
     *
     * @param host the server host name
     * @param port the server port, or -1 to use port 80
     * @throws UnknownHostException if the host cannot be resolved
     * @throws IOException if the connection cannot be established
     */
    protected void openServer(String host, int port) throws UnknownHostException, IOException {
        header.clear();
        responseMessage = "";
        responseCode = -1;
        closeServer();
        if (byteStream != null) {
            byteStream.close();
            byteStream = null;
        }
        InetAddress address = InetAddress.getByName(host);
        client = new Socket(address, port == -1 ? 80 : port);
        client.setSoTimeout(5000);
        sender = new BufferedOutputStream(client.getOutputStream());
        receiver = new BufferedInputStream(client.getInputStream());
    }

    /**
     * Closes the current connection, if any. The connection fields are cleared even when closing
     * fails, so a later request always starts from a clean state.
     *
     * @throws IOException if closing a stream or the socket fails
     */
    protected void closeServer() throws IOException {
        if (client == null) {
            return;
        }
        Socket socket = client;
        OutputStream out = sender;
        InputStream in = receiver;
        client = null;
        sender = null;
        receiver = null;
        try {
            // Close the output first so buffered bytes are flushed while the socket is still open.
            if (out != null) {
                out.close();
            }
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } finally {
                socket.close();
            }
        }
    }

    /** Closes the connection and discards any failure, for use while another error is being reported. */
    private void closeQuietly() {
        try {
            closeServer();
        } catch (IOException ignored) {
            // The original failure has already been reported; a close failure adds nothing.
        }
    }

    /**
     * Builds the absolute URL placed on the request line.
     *
     * @param target the URL being requested
     * @return {@code http://host[:port]} followed by the URL's file part (path and query)
     */
    protected String getURLFormat(URL target) {
        String spec = "http://" + target.getHost();
        if (target.getPort() != -1) {
            spec += ":" + target.getPort();
        }
        return spec + target.getFile();
    }

    /**
     * Writes {@code data} to the server and flushes it.
     *
     * @param data the request text
     * @throws IOException if writing fails
     */
    protected void sendMessage(String data) throws IOException {
        // Use the encoded length; the character count is shorter for non-ASCII text.
        byte[] bytes = data.getBytes();
        sender.write(bytes, 0, bytes.length);
        sender.flush();
    }

    /**
     * Reads one complete response from {@link #receiver}: status line, headers and body, then
     * closes the connection.
     *
     * @throws FileNotFoundException if the server answers 404
     * @throws ProtocolException if the status line is malformed or the response ends after it
     * @throws IOException if the server answers 400 or 503, or reading fails
     */
    protected void receiveMessage() throws IOException {
        int next = readStatusLine();
        switch (responseCode) {
            case 400:
                // Message reads "bad request".
                throw new IOException("\u9519\u8BEF\u8BF7\u6C42");
            case 404:
                throw new FileNotFoundException(getURLFormat(target));
            case 503:
                // Message reads "server unavailable".
                throw new IOException("\u670D\u52A1\u5668\u4E0D\u53EF\u7528");
            default:
                break;
        }
        if (next == -1) {
            // Message reads "message reception terminated abnormally".
            throw new ProtocolException("\u4FE1\u606F\u63A5\u6536\u5F02\u5E38\u7EC8\u6B62");
        }
        readHeaders(next);
        readBody();
        closeServer();
    }

    /**
     * Reads and parses the status line.
     *
     * @return the first byte following the status line's terminator, or -1 at end of stream
     */
    private int readStatusLine() throws IOException {
        ByteArrayOutputStream line = new ByteArrayOutputStream(128);
        int word;
        while ((word = receiver.read()) != -1) {
            if (word == '\r' || word == '\n') {
                // Step over the terminator; for CRLF that takes a second read.
                word = receiver.read();
                if (word == '\n') {
                    word = receiver.read();
                }
                break;
            }
            line.write(word);
        }
        parseStatusLine(line.toString());
        return word;
    }

    /** Splits a status line into version, three-digit code and reason phrase. */
    private void parseStatusLine(String line) throws ProtocolException {
        int space = line.indexOf(' ');
        if (space < 0) {
            throw malformedStatusLine(line, null);
        }
        int codeStart = space + 1;
        while (codeStart < line.length() && line.charAt(codeStart) == ' ') {
            codeStart++;
        }
        int codeEnd = codeStart + 3;
        if (codeEnd > line.length()) {
            throw malformedStatusLine(line, null);
        }
        try {
            responseCode = Integer.parseInt(line.substring(codeStart, codeEnd));
        } catch (NumberFormatException e) {
            throw malformedStatusLine(line, e);
        }
        serverVersion = line.substring(0, space);
        responseMessage = line.substring(codeEnd).trim();
    }

    /** Builds the exception raised for a status line that cannot be parsed. */
    private static ProtocolException malformedStatusLine(String line, Throwable cause) {
        ProtocolException failure = new ProtocolException("Malformed HTTP status line: " + line);
        if (cause != null) {
            failure.initCause(cause);
        }
        return failure;
    }

    /**
     * Reads header lines until the blank line that ends the header section.
     *
     * @param first the first byte of the first header line, already consumed by the caller
     */
    private void readHeaders(int first) throws IOException {
        ByteArrayOutputStream line = new ByteArrayOutputStream(256);
        int word = first;
        while (word != '\r' && word != '\n' && word > -1) {
            line.reset();
            line.write(word == '\t' ? ' ' : word);
            word = readHeaderLine(line);
            storeHeader(line.toString());
        }
    }

    /**
     * Appends the rest of the current header line to {@code line}. Tabs become spaces, and a
     * line break followed by a whitespace byte is treated as a continuation and joined with a
     * single space.
     *
     * @return the first byte of the next line (CR or LF when that line is blank), or -1 at end
     *     of stream
     */
    private int readHeaderLine(ByteArrayOutputStream line) throws IOException {
        int symbol;
        while ((symbol = receiver.read()) > -1) {
            if (symbol == '\t') {
                symbol = ' ';
            } else if (symbol == '\r' || symbol == '\n') {
                int lookahead = receiver.read();
                if (symbol == '\r' && lookahead == '\n') {
                    lookahead = receiver.read();
                    if (lookahead == '\r') {
                        // Start of the blank line; move on to its LF.
                        lookahead = receiver.read();
                    }
                }
                if (lookahead == '\r' || lookahead == '\n' || lookahead > ' ') {
                    return lookahead;
                }
                // Continuation line: the leading whitespace byte is replaced by one space.
                symbol = ' ';
            }
            line.write(symbol);
        }
        return -1;
    }

    /** Stores one {@code name: value} line; lines without a header name are ignored. */
    private void storeHeader(String line) {
        int colon = line.indexOf(':');
        if (colon <= 0) {
            // Properties rejects null keys, so a nameless line cannot be stored.
            return;
        }
        int valueStart = colon + 1;
        while (valueStart < line.length() && line.charAt(valueStart) <= ' ') {
            valueStart++;
        }
        header.put(line.substring(0, colon), line.substring(valueStart));
    }

    /** Reads the rest of the stream as the body; {@link #byteStream} stays unset when it is empty. */
    private void readBody() throws IOException {
        ByteArrayOutputStream body = new ByteArrayOutputStream(4096);
        byte[] chunk = new byte[4096];
        int read;
        while ((read = receiver.read(chunk)) != -1) {
            body.write(chunk, 0, read);
        }
        if (body.size() > 0) {
            byteStream = new ByteArrayInputStream(body.toByteArray());
        }
    }

    /** @return the reason phrase of the last status line, or an empty string */
    public String getResponseMessage() {
        return responseMessage;
    }

    /** @return the status code of the last response, or -1 when none has been parsed */
    public int getResponseCode() {
        return responseCode;
    }

    /** @return the protocol version token of the last status line, or an empty string */
    public String getServerVersion() {
        return serverVersion;
    }

    /** @return a stream over the last response body, or {@code null} when there was none */
    public InputStream getInputStream() {
        return byteStream;
    }

    /**
     * Returns the header name at position {@code i} in the enumeration order of the header table.
     *
     * @param i zero-based position
     * @return the header name, or {@code null} when {@code i} is out of range
     */
    public synchronized String getHeaderKey(int i) {
        if (i < 0 || i >= header.size()) {
            return null;
        }
        Enumeration names = header.propertyNames();
        String key = null;
        for (int j = 0; j <= i; j++) {
            key = (String) names.nextElement();
        }
        return key;
    }

    /**
     * Returns the value of the header at position {@code i}.
     *
     * @param i zero-based position, as for {@link #getHeaderKey(int)}
     * @return the header value, or {@code null} when {@code i} is out of range
     */
    public synchronized String getHeaderValue(int i) {
        String key = getHeaderKey(i);
        return key == null ? null : header.getProperty(key);
    }

    /**
     * Returns the value of the named header.
     *
     * @param key the header name, spelled exactly as the server sent it
     * @return the header value, or {@code null} when the header is absent
     */
    public synchronized String getHeaderValue(String key) {
        return header.getProperty(key);
    }

    /** @return the header lines sent with every request, each terminated by CRLF */
    protected String getBaseHeads() {
        return "User-Agent: ZealHttp/1.0\r\nAccept: www/source; text/html; image/gif; */*\r\n";
    }

    /**
     * Demo: runs a Google search for "water" and prints the text found between the first and
     * last double quote of the search box markup in the result page.
     *
     * @param arg ignored
     */
    public static void main(String[] arg) {
        try {
            Http http = new Http();
            http.GET("http://www.google.com/search?hl=zh-TW&q=water");
            InputStream body = http.getInputStream();
            if (body == null) {
                // The request failed or returned no body; the cause was already printed by GET.
                System.err.println("No response body (HTTP status " + http.getResponseCode() + ")");
                return;
            }
            BufferedReader br = new BufferedReader(new InputStreamReader(body));
            try {
                String html = "";
                String rep;
                while ((rep = br.readLine()) != null) {
                    Matcher match = SEARCH_BOX.matcher(rep);
                    if (match.find()) {
                        html = match.group();
                        break;
                    }
                }
                int firstQuote = html.indexOf("\"");
                int lastQuote = html.lastIndexOf("\"");
                if (lastQuote > firstQuote) {
                    System.out.println(html.substring(firstQuote + 1, lastQuote));
                } else {
                    System.err.println("Search box not found in the response");
                }
            } finally {
                br.close();
            }
            http.closeServer();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

AlfredRomeoa at 2007-7-9 6:23:33 > top of Java-index,Core,Core APIs...
# 2

Hi,

What I want is to send a keyword to Google (any keyword) and have the Java program print the 10 links returned by Google, each on a separate line.

example: for the keyword soccer:

http://www.soccer.com

http://www.soccernet.com

....

Could you help me?

Thanks

mp

mpea at 2007-7-9 6:23:33 > top of Java-index,Core,Core APIs...
# 3

First, the 403 error mentioned in reply 1 is because for whatever reason Google search requires the User-Agent header. This should take care of it:

// Open the connection through URLConnection so a request header can be added to it.
URLConnection connection = url.openConnection();

// Supply a User-Agent header; Google search requires it and otherwise answers with 403.
connection.addRequestProperty("User-Agent", "Mozilla Firefox 0.9b2");

To the original question: I can think of two ways: [url=http://en.wikipedia.org/wiki/Screen_scraping]screen scraping[/url] and the Google search API.

Screen scraping is simple: do the query, then search for the result in the returned HTML. The HTML contains something like this:

(the forum software seems buggered; I'll replace less-than and greater-than with parentheses)

(div class=g)(a href="http://ga.water.usgs.gov/edu/"

So to find the URL search for the string "(div class=g)(a href=\"", then search for the terminating ". You can do this with String.indexOf() or a regex.

The downside of screen scraping is that it is sensitive to changes in the web page. If Google decides to alter the layout of the page even a little bit, the magic "(div class=g)" can go away and the program needs to be fixed.

The alternative is Google search API. I don't know about its current status; I understand Google dropped their SOAP API a while back. They still have an AJAX search API but I know nothing about it. Search for it.

sjasjaa at 2007-7-9 6:23:33 > top of Java-index,Core,Core APIs...