diff --git a/.gitignore b/.gitignore index 016f1f5..f2fd926 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ hs_err_pid* .project .classpath .settings/ +.idea diff --git a/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java b/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java index 1f1bc0e..fb0ee1f 100644 --- a/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java +++ b/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java @@ -68,6 +68,8 @@ public class HostToDomainGraph { protected boolean countHosts = false; protected boolean privateDomains = false; + + protected boolean stripWww = false; protected boolean includeMultiPartSuffixes = false; protected long maxSize; @@ -84,6 +86,13 @@ public class HostToDomainGraph { private static Pattern SPLIT_HOST_PATTERN = Pattern.compile("\\."); + public final static String AGGREGATION_HOST_WITHOUT_WWW = "host-without-www"; + public final static String AGGREGATION_PRIVATE_DOMAIN = "private-domain"; + public final static String AGGREGATION_REGISTERED_DOMAIN = "registered-domain"; + + private final static List ALLOWED_AGGREGATION_PARAMS = java.util.Arrays + .asList(AGGREGATION_REGISTERED_DOMAIN, AGGREGATION_PRIVATE_DOMAIN, AGGREGATION_HOST_WITHOUT_WWW); + private Consumer reporterInputNodes = (String line) -> { if ((numInputLinesNodes % 500000) != 0 || numInputLinesNodes == 0) { return; @@ -281,6 +290,13 @@ public void multiPartSuffixesAsDomains(boolean include) { this.includeMultiPartSuffixes = include; } + /** + * @param stripWww if true the www. prefix is stripped + */ + public void setStripWww(boolean stripWww) { + this.stripWww = stripWww; + } + /** * Reverse host name, eg. www.example.com is reversed to * com.example.www. Can also be used to "unreverse" a reversed host @@ -327,13 +343,23 @@ public String convertNode(String line) { } lastRevHost = revHost; String host = reverseHost(revHost); - String domain = EffectiveTldFinder.getAssignedDomain(host, true, !privateDomains); + String domain = null; StringBuilder sb = new StringBuilder(); - if (domain == null && includeMultiPartSuffixes) { - if (EffectiveTldFinder.getEffectiveTLDs().containsKey(host) && host.indexOf('.') != -1) { - LOG.info("Accepting public suffix (containing dot) as domain: {}", host); + if (this.stripWww) { + if (host.startsWith("www.") && host.indexOf('.', 4) != -1) { + // strip leading 'www' to reduce number of "duplicate" hosts, + // but leave at least 2 trailing parts (www.com is a valid domain) + host = host.substring(4); } domain = host; + } else { + domain = EffectiveTldFinder.getAssignedDomain(host, true, !privateDomains); + if (domain == null && includeMultiPartSuffixes) { + if (EffectiveTldFinder.getEffectiveTLDs().containsKey(host) && host.indexOf('.') != -1) { + LOG.info("Accepting public suffix (containing dot) as domain: {}", host); + } + domain = host; + } } if (domain == null) { LOG.warn("No domain for host: {}", host); @@ -499,9 +525,24 @@ private static void showHelp() { System.err.println("Options:"); System.err.println(" -h\t(also -? or --help) show usage message and exit"); System.err.println(" -c\tcount hosts per domain (additional column in "); - System.err.println(" --private-domains\tconvert to private domains (include suffixes from the"); + System.err.println(" --private-domains\t(deprecated - use --aggregation-level)"); + System.err.println(" \tconvert to private domains (include suffixes from the"); System.err.println(" \tPRIVATE domains subdivision of the public suffix list,"); - System.err.println(" \tsee https://github.com/publicsuffix/list/wiki/Format#divisions"); + System.err.println(" \tsee https://github.com/publicsuffix/list/wiki/Format#divisions)"); + System.err.println(" --aggregation-level \tdefine the strategy on which hosts are folded to domains."); + System.err + .println(" \t values: registered-domain (default), private-domain, "); + System.err.println(" \thost-without-www. "); + System.err.println(" \t- registered-domain: convert only the registered domains "); + System.err.println(" \t- private-domain: convert to private domains "); + System.err.println( + " \t(include suffixes from the PRIVATE domains subdivision of the "); + System.err.println(" \tpublic suffix list, "); + System.err.println( + " \tsee https://github.com/publicsuffix/list/wiki/Format#divisions)"); + System.err + .println(" \t- host-without-www: strip the www. prefix (keep the "); + System.err.println(" \tfull host otherwise)"); System.err.println(" --multipart-suffixes-as-domains\toutput host names which are equal to multi-part"); System.err.println(" \tpublic suffixes (the suffix contains a dot) as domain"); System.err.println(" \tnames, eg. `gov.uk', `freight.aero' or `altoadige.it'."); @@ -512,6 +553,8 @@ public static void main(String[] args) { boolean countHosts = false; boolean includeMultiPartSuffixes = false; boolean privateDomains = false; + String aggregationLevel = null; + boolean stripWww = false; int argpos = 0; while (argpos < args.length && args[argpos].startsWith("-")) { switch (args[argpos]) { @@ -528,9 +571,28 @@ public static void main(String[] args) { includeMultiPartSuffixes = true; break; case "--private-domains": - case "--private": // back-ward compatibility + case "--private": // back-ward compatibility (but deprecated in favour of --aggregation-level) + LOG.warn( + "The parameter --private / --private-domains is deprecated, in favour of --aggregation-level with value private-domain"); privateDomains = true; break; + case "--aggregation-level": + if ((argpos + 1) >= args.length) { + LOG.error("Missing value for option " + args[argpos]); + showHelp(); + System.exit(1); + } + String value = args[argpos + 1]; + + if (!ALLOWED_AGGREGATION_PARAMS.contains(value)) { + LOG.error("Unknown value for option " + args[argpos] + ": " + value); + showHelp(); + System.exit(1); + } else { + aggregationLevel = value; + } + argpos++; + break; default: System.err.println("Unknown option " + args[argpos]); showHelp(); @@ -549,15 +611,37 @@ public static void main(String[] args) { LOG.error("Invalid number: " + args[argpos + 0]); System.exit(1); } + if (aggregationLevel != null) { + if (privateDomains) { + LOG.error( + "You cannot specify both --private or --private-domains, and --aggregation-level. " + + "Prefer --aggregation-level [level] because it will supersede the other option."); + System.exit(1); + } else { + switch (aggregationLevel) { + case AGGREGATION_REGISTERED_DOMAIN: + break; + case AGGREGATION_PRIVATE_DOMAIN: + privateDomains = true; + break; + case AGGREGATION_HOST_WITHOUT_WWW: + stripWww = true; + break; + } + } + } + HostToDomainGraph converter; if (maxSize <= Arrays.MAX_ARRAY_SIZE) { converter = new HostToDomainGraph((int) maxSize); } else { converter = new HostToDomainGraphBig(maxSize); } + converter.doCount(countHosts); converter.multiPartSuffixesAsDomains(includeMultiPartSuffixes); converter.doPrivateDomains(privateDomains); + converter.setStripWww(stripWww); converter.reportConfig(); String nodesIn = args[argpos + 1]; String nodesOut = args[argpos + 2]; diff --git a/src/script/host2domaingraph.sh b/src/script/host2domaingraph.sh index 95c895d..a339b53 100755 --- a/src/script/host2domaingraph.sh +++ b/src/script/host2domaingraph.sh @@ -11,6 +11,13 @@ while true; do PROPERTIES=("${PROPERTIES[@]}" "$1") shift ;; + "--aggregation-level" ) + FLAGS=("${FLAGS[@]}" "$1") + shift + # takes one argument + FLAGS=("${FLAGS[@]}" "$1") + shift + ;; "-"* ) FLAGS=("${FLAGS[@]}" "$1") shift diff --git a/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java b/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java index f6e8df1..147b706 100644 --- a/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java +++ b/src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java @@ -139,6 +139,26 @@ class TestHostToDomainGraph { "8\tname.hit\t1", // }; + String[] hostGraphWithWwwDomains = { // + "0\tname.hiro", // + "1\tname.hiro.adam", // + "2\tname.hiro.www", // + "3\tname.his.forgot.ben.www", // + "4\tname.his.forgot.never", // + "5\tname.his.prz.www", // + "6\tname.his.www", // + "7\tname.hit.www", // + }; + String[] domainGraphWithWwwDomains = { // + "0\tname.hiro\t2", // + "1\tname.hiro.adam\t1", // + "2\tname.his\t1", // + "3\tname.his.forgot.ben\t1", // + "4\tname.his.forgot.never\t1", // + "5\tname.his.prz\t1", // + "6\tname.hit\t1", // + }; + @BeforeEach void init() { converter = new HostToDomainGraph(maxGraphNodes); @@ -267,4 +287,33 @@ void testConvertPrivateDomain() { assertArrayEquals(domainGraphPrivateDomains, convert(converter, hostGraphPrivateDomains)); } + @Test + void testConvertStripWww() { + // verify sorting of input and expected output + testSorted(hostGraphWithWwwDomains); + testSorted(domainGraphWithWwwDomains); + converter.doCount(true); + converter.setStripWww(true); + converter.multiPartSuffixesAsDomains(true); + String[] convert = convert(converter, hostGraphWithWwwDomains); + assertArrayEquals(domainGraphWithWwwDomains, convert); + } + + /** + * Test that www.com is not stripped (only one trailing part after www.) + */ + @Test + void testConvertStripWwwEdgeCaseWwwDotCom() { + String[] hostGraph = { // + "0\tcom.www", // + }; + String[] expectedDomainGraph = { // + "0\tcom.www\t1", // + }; + converter.doCount(true); + converter.setStripWww(true); + String[] convert = convert(converter, hostGraph); + assertArrayEquals(expectedDomainGraph, convert); + } + }