Subversion Repositories PlanixRsrch.SVN

Compare Revisions

Ignore whitespace Rev 388 → Rev 394

/tags/Crl/0.0.1a/CHANGES
0,0 → 1,10
08-07-2020
----------
Version 0.0.1a
Minor performance improvements regarding Derby.
 
07-01-2020
----------
Version 0.0.1
Crl crawls only DE domains and uses DerbyDB to store its state.
Crl is very slow because of DerbyDB.
/tags/Crl/0.0.1a/README
0,0 → 1,0
Crl is a simple web crawler written in Java.
/tags/Crl/0.0.1a/nbactions.xml
0,0 → 1,46
<?xml version="1.0" encoding="UTF-8"?>
<actions>
<action>
<actionName>run</actionName>
<packagings>
<packaging>jar</packaging>
</packagings>
<goals>
<goal>process-classes</goal>
<goal>org.codehaus.mojo:exec-maven-plugin:1.5.0:exec</goal>
</goals>
<properties>
<exec.args>-classpath %classpath org.planix.crl.Crl</exec.args>
<exec.executable>java</exec.executable>
</properties>
</action>
<action>
<actionName>debug</actionName>
<packagings>
<packaging>jar</packaging>
</packagings>
<goals>
<goal>process-classes</goal>
<goal>org.codehaus.mojo:exec-maven-plugin:1.5.0:exec</goal>
</goals>
<properties>
<exec.args>-agentlib:jdwp=transport=dt_socket,server=n,address=${jpda.address} -classpath %classpath org.planix.crl.Crl</exec.args>
<exec.executable>java</exec.executable>
<jpda.listen>true</jpda.listen>
</properties>
</action>
<action>
<actionName>profile</actionName>
<packagings>
<packaging>jar</packaging>
</packagings>
<goals>
<goal>process-classes</goal>
<goal>org.codehaus.mojo:exec-maven-plugin:1.5.0:exec</goal>
</goals>
<properties>
<exec.args>-classpath %classpath org.planix.crl.Crl</exec.args>
<exec.executable>java</exec.executable>
</properties>
</action>
</actions>
/tags/Crl/0.0.1a/pom.xml
0,0 → 1,94
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.planix</groupId>
<artifactId>Crl</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>org.apache.derby</groupId>
<artifactId>derby</artifactId>
<version>10.15.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.derby</groupId>
<artifactId>derbynet</artifactId>
<version>10.15.2.0</version>
</dependency>
<dependency>
<groupId>de.malkusch.whois-server-list</groupId>
<artifactId>public-suffix-list</artifactId>
<version>2.2.0</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
</dependencies>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
</properties>
 
<build>
<finalName>CRL</finalName>
<plugins>
 
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>2.4</version>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<mainClass>org.planix.crl.Crl</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
 
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.1.1</version>
 
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<mainClass>org.planix.crl.Crl</mainClass>
</manifest>
</archive>
</configuration>
 
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
 
</plugin>
 
</plugins>
</build>
</project>
/tags/Crl/0.0.1a/src/main/java/org/planix/crl/Crl.java
0,0 → 1,167
package org.planix.crl;
 
import de.malkusch.whoisServerList.publicSuffixList.*;
import java.util.concurrent.atomic.*;
 
import java.security.MessageDigest;
import java.nio.charset.StandardCharsets;
 
import java.net.*;
import java.util.HashMap;
 
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
/**
*
* @author tube
*/
public class Crl {
 
static Db db;
static PublicSuffixList suffixList;
static AtomicInteger fetch_threads;
static AtomicInteger parse_threads;
static HashMap<String, Integer> domains_connectd;
static final int MAX_FETCH_THREADS = 100;
static final int MAX_PARSE_THREADS = 10;
static final int MAX_CONNS_PER_DOMAIN = 6;
static final String DB_DIR = "/tmp/crl";
 
public static void main(String[] args) throws Exception {
 
domains_connectd = new HashMap<>();
fetch_threads = new AtomicInteger(0);
parse_threads = new AtomicInteger(0);
 
PublicSuffixListFactory factory = new PublicSuffixListFactory();
suffixList = factory.build();
 
db = new Db();
db.reset();
// db.insert(0, "http://www-i4.informatik.rwth-aachen.de/content/teaching/proseminars/sub/2002_2003_ws_docs/ip.pdf");
db.insert(0, "https://www.surfpoeten.de/");
/* db.insert(0, "https://www.spiegel.de");
db.insert(0, "https://www.google.de");
db.insert(0, "https://de.yahoo.com");
db.insert(0, "https://twitter.com");
db.insert(0, "https://bitcoinwisdom.io");
*/
 
/* db.insert(0, "https://www.surfpoeten.de/home");
db.insert(0, "https://www.surfpoeten.de/tube");
*/
// String dom = Crl.suffixList.getRegistrableDomain("windows.cauwersin.co.uk");
// System.out.printf("DOM: %s\n", dom);
System.out.printf("Threads: %d\n", Thread.activeCount());
do {
String url, parse_url;
 
url = null;
parse_url = null;
 
// System.out.printf("NUMBER OF FETCHERS/PARSERRS: %d/%d\n",
// fetch_threads.get(),
// parse_threads.get());
if (fetch_threads.get() < Crl.MAX_FETCH_THREADS ) {
url = db.getJob();
if (url != null) {
Fetch f = new Fetch(url, 0);
Crl.fetch_threads.incrementAndGet();
f.start();
}
}
if (parse_threads.get() < Crl.MAX_PARSE_THREADS) {
parse_url = db.getParseJob();
if (parse_url != null) {
Parser p = new Parser(parse_url, 0);
Crl.parse_threads.incrementAndGet();
p.start();
}
}
 
if (parse_url == null && url == null) {
Thread.sleep(5000);
}
 
} while (true /*url != null*/);
 
/* for (int i = 0; i < 100; i++) {
Fetch f = new Fetch("https://www.surfpoeten.de/", i);
f.start();
}
*/
/* while (fetch_threads.get() > 0) {
Thread.sleep(1000);
System.out.printf("Threads: %d\n", fetch_threads.get());
}*/
}
 
public static String getUrlFileName(String url) throws Exception {
MessageDigest digest = MessageDigest.getInstance("SHA-256");
byte[] encodedhash = digest.digest(
url.getBytes() /*StandardCharsets.UTF_8)*/);
 
return Crl.DB_DIR+"/crl_" + bytesToHex(encodedhash) + ".htm";
}
 
private static String bytesToHex(byte[] hash) {
StringBuffer hexString = new StringBuffer();
for (int i = 0; i < hash.length; i++) {
String hex = Integer.toHexString(0xff & hash[i]);
if (hex.length() == 1) {
hexString.append('0');
}
hexString.append(hex);
}
return hexString.toString();
}
 
public static String removeUrlFragment(String url) throws URISyntaxException {
java.net.URI uri = new URI(url);
return new java.net.URI(
uri.getScheme(),
uri.getUserInfo(),
uri.getHost(),
uri.getPort(),
uri.getPath(),
uri.getQuery(), null).toString();
}
 
public static synchronized boolean addDomainConnection(String domain) {
Integer count = Crl.domains_connectd.getOrDefault(domain, 0);
// System.out.printf("Connection Count for %s = %d\n", domain,count);
if (count >= Crl.MAX_CONNS_PER_DOMAIN) {
return false;
}
count++;
Crl.domains_connectd.put(domain, count);
return true;
}
 
public static synchronized boolean delDomainConnection(String domain) {
Integer count = Crl.domains_connectd.getOrDefault(domain, 0);
count--;
if (count <= 0) {
Crl.domains_connectd.remove(domain);
} else {
Crl.domains_connectd.put(domain, count);
}
return true;
}
 
public static String getUrlDomain(String link) throws URISyntaxException {
URI uri;
uri = new URI(link);
return Crl.suffixList.getRegistrableDomain(uri.getHost());
}
 
}
/tags/Crl/0.0.1a/src/main/java/org/planix/crl/Db.java
0,0 → 1,227
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package org.planix.crl;
 
import java.sql.*;
import java.time.Instant;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.*;
 
/**
*
* @author tube
*/
public class Db {
 
Connection conn = null;
 
Db() throws Exception {
System.out.println("Starting DB");
String driver = "org.apache.derby.jdbc.EmbeddedDriver";
Class.forName(driver).newInstance();
String protocol = "jdbc:derby:";
 
conn = DriverManager.getConnection(protocol + Crl.DB_DIR + "/derbyDB;create=true");
 
Statement stmt = conn.createStatement();
 
// Create table for urls to fetch
try {
stmt.execute("CREATE TABLE urls(prio INT "
+ "NOT NULL,url VARCHAR(2048))");
stmt.execute("CREATE INDEX urls_prio ON urls (prio)");
stmt.execute("CREATE UNIQUE INDEX urls_url ON urls (url)");
} catch (SQLException ex) {
if (!ex.getSQLState().endsWith("X0Y32")) {
throw (ex);
}
}
 
// create tabel for seen domains
try {
stmt.execute("CREATE TABLE domains(domain VARCHAR(1024))");
stmt.execute("CREATE UNIQUE INDEX domain_domain ON domains (domain)");
} catch (SQLException ex) {
if (!ex.getSQLState().endsWith("X0Y32")) {
throw (ex);
}
}
 
try {
stmt.execute("CREATE TABLE pages(time INT,"
+ "url VARCHAR(2048))");
stmt.execute("CREATE UNIQUE INDEX pages_url ON pages (url)");
stmt.execute("CREATE INDEX pages_time ON pages (time)");
} catch (SQLException ex) {
if (!ex.getSQLState().endsWith("X0Y32")) {
throw (ex);
}
}
conn.setAutoCommit(false);
}
 
public synchronized void insert(long prio, String url) throws SQLException {
try {
// System.out.printf("Insert into DB %d - %s\n", prio,url);
PreparedStatement ps = conn.prepareStatement("INSERT INTO urls (prio,url) VALUES(?,?)");
ps.setLong(1, prio);
ps.setString(2, url);
ps.execute();
} catch (SQLException ex) {
 
if (!ex.getSQLState().endsWith("23505")) {
throw ex;
}
}
}
 
public synchronized void insertMany(
ArrayList<Long> prios_a,
ArrayList<String> urls_a) throws SQLException {
// System.out.printf("Insert into DB %d - %s\n", prio,url);
PreparedStatement ps = conn.prepareStatement("INSERT INTO urls (prio,url) VALUES(?,?)");
 
// System.out.printf("Many Size %d\n", prios_a.size());
for (int i = 0; i < prios_a.size(); i++) {
try {
ps.clearParameters();
Long prio = prios_a.get(i);
String url = urls_a.get(i);
ps.setLong(1, prio);
ps.setString(2, url);
ps.execute();
// System.out.printf("Insert many %s %s\n", prio, url);
} catch (SQLException ex) {
if (!ex.getSQLState().endsWith("23505")) {
throw ex;
}
}
conn.commit();
}
 
}
 
public synchronized String getJob() throws SQLException {
Statement stmt = conn.createStatement();
PreparedStatement ps = conn.prepareStatement(
"SELECT url FROM urls WHERE prio > -1 AND prio < ?"
+ "ORDER BY prio FETCH FIRST 1 ROWS ONLY"
);
 
long unixTime = Instant.now().getEpochSecond();
ps.setLong(1, unixTime);
 
ResultSet rs = ps.executeQuery();
 
if (rs.next()) {
String url = rs.getString("url");
ps = conn.prepareStatement("UPDATE urls SET prio = -1 WHERE url = ?");
ps.setString(1, url);
ps.execute();
return url;
}
return null;
}
 
public synchronized String getParseJob() throws SQLException {
Statement stmt = conn.createStatement();
PreparedStatement ps = conn.prepareStatement(
"SELECT url FROM pages WHERE time > -1 "
+ "ORDER BY time FETCH FIRST 1 ROWS ONLY"
);
 
// long unixTime = Instant.now().getEpochSecond();
// ps.setLong(1, unixTime);
ResultSet rs = ps.executeQuery();
 
if (rs.next()) {
String url = rs.getString("url");
ps = conn.prepareStatement("UPDATE pages SET time = -1 WHERE url = ?");
ps.setString(1, url);
ps.execute();
return url;
}
return null;
}
 
public synchronized void endJob(String url, int next) throws SQLException {
PreparedStatement ps = conn.prepareStatement(
"UPDATE urls SET prio = ? WHERE url = ?");
long unixTime = Instant.now().getEpochSecond();
ps.setLong(1, unixTime + next);
ps.setString(2, url);
ps.execute();
}
 
public synchronized void endParseJob(String url) throws SQLException {
// PreparedStatement ps = conn.prepareStatement(
// "UPDATE pages SET time = ? WHERE url = ?");
 
PreparedStatement ps = conn.prepareStatement(
"DELETE FROM pages WHERE url = ?");
 
/* long unixTime = Instant.now().getEpochSecond();
ps.setLong(1, unixTime + next);
*/
ps.setString(1, url);
ps.execute();
}
 
public synchronized void addPage(String url) throws SQLException {
try {
 
PreparedStatement ps = conn.prepareStatement(
"INSERT INTO pages (time,url) VALUES(?,?)");
long unixTime = Instant.now().getEpochSecond();
ps.setLong(1, unixTime);
ps.setString(2, url);
ps.execute();
} catch (SQLException ex) {
if (!ex.getSQLState().endsWith("23505")) {
throw ex;
}
// Logger.getLogger(Db.class.getName()).log(Level.SEVERE, null, ex);
}
}
 
public synchronized boolean addDomain(String domain) throws SQLException {
try {
PreparedStatement ps;
ps = conn.prepareStatement(
"SELECT domain FROM domains WHERE domain=?");
ps.setString(1, domain);
ResultSet rs = ps.executeQuery();
 
if (rs.next()) {
return false;
}
 
ps = conn.prepareStatement(
"INSERT INTO domains (domain) VALUES(?)");
ps.setString(1, domain);
ps.execute();
} catch (SQLException ex) {
if (!ex.getSQLState().endsWith("23505")) {
throw ex;
}
// System.out.printf("Domain %s false\n", domain);
return false;
// Logger.getLogger(Db.class.getName()).log(Level.SEVERE, null, ex);
}
// System.out.printf("Domain %s true\n", domain);
return true;
}
 
synchronized void reset() throws SQLException {
Statement stmt = conn.createStatement();
stmt.execute("UPDATE urls SET prio=0 WHERE prio=-1");
// stmt.execute("UPDATE pages SET time=0 WHERE time=-1");
}
 
}
/tags/Crl/0.0.1a/src/main/java/org/planix/crl/Fetch.java
0,0 → 1,160
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package org.planix.crl;
 
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
 
import java.io.*;
import java.net.URISyntaxException;
import java.sql.SQLException;
import java.util.Random;
 
/**
*
* @author tube
*/
public class Fetch extends Thread {
 
String urlp;
Random random;
 
Fetch(String url, int i) {
this.i = i;
this.urlp = url;
random = new Random();
}
 
int i;
 
@Override
public void run() {
// Crl.fetch_threads.incrementAndGet();
// System.out.printf("Fetch: %s %d\n", urlp, i);
try {
fetch(urlp);
//Thread.sleep(10000);
} catch (Exception ex) {
Logger.getLogger(Fetch.class.getName()).log(Level.SEVERE, null, ex);
}
 
Crl.fetch_threads.decrementAndGet();
}
 
void fetch(String urlp) throws SQLException {
try {
 
URL url = new URL(urlp);
 
boolean rc;
String dom = null;
try {
dom = Crl.getUrlDomain(urlp);
} catch (URISyntaxException ex) {
Logger.getLogger(Fetch.class.getName()).log(Level.SEVERE, null, ex);
}
rc = Crl.addDomainConnection(dom);
if (!rc) {
// System.out.printf("TOO MANY CONNECTIONS TO %s\n",dom);
Crl.db.endJob(urlp, 5 * 60 + random.nextInt(600)); // retry in 5 minutes
return;
}
 
URLConnection urlConnection = url.openConnection();
Map<String, List<String>> headers = urlConnection.getHeaderFields();
Set<Map.Entry<String, List<String>>> entrySet = headers.entrySet();
boolean ctfound = false;
for (Map.Entry<String, List<String>> entry : entrySet) {
String headerName = entry.getKey();
// System.out.println("Header Name:" + headerName);
if (headerName == null) {
continue;
}
if (headerName.compareTo("Content-Type") != 0) {
continue;
}
 
// System.out.println("Header Name:" + headerName);
List<String> headerValues = entry.getValue();
 
for (String value : headerValues) {
// System.out.print("Header value:" + value);
if (value.contains("text/html")) {
ctfound = true;
}
}
// System.out.println();
// System.out.println();
 
}
 
if (!ctfound) {
// System.out.printf("No text/html\n");
Crl.db.endJob(urlp, 3600 * 24);
Crl.delDomainConnection(dom);
return;
}
 
InputStream inputStream = null;
try {
inputStream = urlConnection.getInputStream();
} catch (Exception ex) {
System.out.printf("FETCH ERROR: %s\n", ex.getCause());
Crl.db.endJob(urlp, 3600 * 24);
Crl.delDomainConnection(dom);
return;
}
// BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream));
 
BufferedInputStream bufferedReader = new BufferedInputStream(inputStream);
//(new InputStreamReader(inputStream));
 
byte[] buf = new byte[128 * 1024];
String fileName = null;
try {
fileName = Crl.getUrlFileName(urlp);
} catch (Exception ex) {
Logger.getLogger(Fetch.class.getName()).log(Level.SEVERE, null, ex);
}
FileOutputStream outputStream = new FileOutputStream(fileName);
 
while (true) {
int r = bufferedReader.read(buf);
// System.out.printf("Read: %d\n",r);
if (r == -1) {
break;
}
outputStream.write(buf, 0, r);
}
outputStream.close();
bufferedReader.close();
Crl.db.addPage(urlp);
Crl.delDomainConnection(dom);
} catch (MalformedURLException e) {
 
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
 
System.out.printf("FETCHED (%d,%d): %s\n",
Crl.fetch_threads.get(),
Crl.parse_threads.get(),
urlp);
Crl.db.endJob(urlp, 3600 * 24);
 
}
}
/tags/Crl/0.0.1a/src/main/java/org/planix/crl/Parser.java
0,0 → 1,122
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package org.planix.crl;
 
import java.io.*;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.sql.Array;
import java.sql.SQLException;
import java.time.Instant;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
 
import org.jsoup.*;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
 
/**
*
* @author tube
*/
public class Parser extends Thread {
 
String urlp;
int i;
Random random;
 
Parser(String url, int i) {
this.i = i;
this.urlp = url;
random = new Random();
}
 
@Override
public void run() {
// Crl.parse_threads.incrementAndGet();
// System.out.printf("Parse: %s %d\n", urlp, i);
try {
parse(urlp);
//Thread.sleep(10000);
} catch (Exception ex) {
Logger.getLogger(Fetch.class.getName()).log(Level.SEVERE, null, ex);
}
System.out.printf("PARSED: %s\n", urlp);
Crl.parse_threads.decrementAndGet();
}
 
void parse(String urlp) throws SQLException, Exception {
String filename = Crl.getUrlFileName(urlp);
File input = new File(filename);
Document doc = Jsoup.parse(input, "UTF-8", urlp);
getPageLinks(doc, urlp);
Crl.db.endParseJob(urlp);
}
 
public void getPageLinks(Document doc, String urlp) throws SQLException {
 
//2. Fetch the HTML code
//3. Parse the HTML to extract links to other URLs
Elements linksOnPage = doc.select("a[href]");
//System.out.printf("INSERTING LINKS %s\n", urlp);
 
ArrayList<String> urls_a = new ArrayList<>();
ArrayList<Long> prios_a= new ArrayList<>();
//5. For each extracted URL... go back to Step 4.
int n=0;
for (Element page : linksOnPage) {
long prio;
String link = page.attr("abs:href");
URI uri;
String dom; // = Crl.suffixList.getRegistrableDomain(uri.getHost());
 
try {
link = Crl.removeUrlFragment(link);
uri = new URI(link);
dom = Crl.getUrlDomain(link);
} catch (URISyntaxException ex) {
// System.out.printf("BAD URI: %s\n",link);
// Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex);
continue;
}
boolean rc;
rc = Crl.db.addDomain(dom);
// rc=true;
String tld = Crl.suffixList.getPublicSuffix(dom);
if (tld==null)
continue;
if (tld.compareTo("de")!=0)
continue;
if (rc) {
prio = 0;
}else
{
prio = 1 + random.nextInt(1000000); //Instant.now().getEpochSecond()+30+n;
}
prios_a.add(prio);
urls_a.add(link);
// Crl.db.insert(prio, link);
// System.out.printf("Insert Link: %d %s\n",prio, page.attr("abs:href"));
n++;
}
Crl.db.insertMany(prios_a, urls_a);
 
}
 
}
/tags/Crl/0.0.1a
Property changes:
Added: svn:ignore
## -0,0 +1 ##
+target
Added: svn:mergeinfo
## -0,0 +0,5 ##
Merged /branches/bnf/Crl:r79-129
Merged /branches/mtr/Crl:r343-348
Merged /branches/crl-java/Crl/Crl:r363-383
Merged /branches/dq/Crl:r41-288
Merged /branches/crl-tika/Crl/Crl:r358