001 /**
002 * Copyright (C) 2007-2009, Jens Lehmann
003 *
004 * This file is part of DL-Learner.
005 *
006 * DL-Learner is free software; you can redistribute it and/or modify
007 * it under the terms of the GNU General Public License as published by
008 * the Free Software Foundation; either version 3 of the License, or
009 * (at your option) any later version.
010 *
011 * DL-Learner is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
014 * GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License
017 * along with this program. If not, see <http://www.gnu.org/licenses/>.
018 *
019 */
020 package org.dllearner.algorithms.refinement2;
021
022 import java.io.File;
023 import java.text.DecimalFormat;
024 import java.util.HashSet;
025 import java.util.Iterator;
026 import java.util.LinkedList;
027 import java.util.List;
028 import java.util.Map;
029 import java.util.NavigableSet;
030 import java.util.Set;
031 import java.util.SortedSet;
032 import java.util.TreeSet;
033 import java.util.concurrent.ConcurrentSkipListSet;
034
035 import org.apache.log4j.Logger;
036 import org.dllearner.core.LearningProblem;
037 import org.dllearner.core.ReasonerComponent;
038 import org.dllearner.core.configurators.ROLComponent2Configurator;
039 import org.dllearner.core.owl.Description;
040 import org.dllearner.core.owl.Individual;
041 import org.dllearner.core.owl.Intersection;
042 import org.dllearner.core.owl.Thing;
043 import org.dllearner.core.owl.Union;
044 import org.dllearner.learningproblems.EvaluatedDescriptionPosNeg;
045 import org.dllearner.learningproblems.PosNegLP;
046 import org.dllearner.learningproblems.ScorePosNeg;
047 import org.dllearner.refinementoperators.RefinementOperator;
048 import org.dllearner.refinementoperators.RhoDRDown;
049 import org.dllearner.utilities.Files;
050 import org.dllearner.utilities.Helper;
051 import org.dllearner.utilities.JamonMonitorLogger;
052 import org.dllearner.utilities.owl.ConceptComparator;
053 import org.dllearner.utilities.owl.ConceptTransformation;
054 import org.dllearner.utilities.owl.EvaluatedDescriptionPosNegComparator;
055
056 import com.jamonapi.Monitor;
057
058 /**
059 * Implements the 2nd version of the refinement operator based learning approach.
060 *
061 * @author Jens Lehmann
062 *
063 */
064 public class ROLearner2 {
065
066 private static Logger logger = Logger.getLogger(ROLearner2.class);
067 private ROLComponent2Configurator configurator;
068
069 // basic setup: learning problem and reasoning service
070 private ReasonerComponent rs;
071 // often the learning problems needn't be accessed directly; instead
072 // use the example sets below and the posonly variable
073 private PosNegLP learningProblem;
074 private Description startDescription;
075 private int nrOfExamples;
076 private int nrOfPositiveExamples;
077 private Set<Individual> positiveExamples;
078 private int nrOfNegativeExamples;
079 private Set<Individual> negativeExamples;
080
081 // noise regulates how many positives can be misclassified and when the
082 // algorithm terminates
083 private double noise = 0.0;
084 private int allowedMisclassifications = 0;
085
086 // positive only learning options:
087 // if no negatives are given, then one possible strategy is to find a very
088 // special concept still entailing all positive examples;
089 // this is realised by changing the termination criterion: a concept is a
090 // solution if it has been expanded x times (x is
091 // configurable) but no more special concept is found (all are either
092 // equivalent or too weak)
093 private int maxPosOnlyExpansion;
094
095 // search tree options
096 private boolean writeSearchTree;
097 private File searchTreeFile;
098 private boolean replaceSearchTree = false;
099
100 // constructs to improve performance
101 private boolean useTooWeakList = true;
102 private boolean useOverlyGeneralList = true;
103 private boolean useShortConceptConstruction = true;
104
105 // extended Options
106 private long maxExecutionTimeInSeconds;
107 private boolean maxExecutionTimeAlreadyReached = false;
108 private long minExecutionTimeInSeconds = 0;
109 private boolean minExecutionTimeAlreadyReached = false;
110 private int guaranteeXgoodDescriptions = 1;
111 private boolean guaranteeXgoodAlreadyReached = false;
112 private int maxClassDescriptionTests;
113
114 // if set to false we do not test properness; this may seem wrong
115 // but the disadvantage of properness testing are additional reasoner
116 // queries and a search bias towards ALL r.something because
117 // ALL r.TOP is improper and automatically expanded further
118 private boolean usePropernessChecks = false;
119
120 // tree traversal means to run through the most promising concepts
121 // and connect them in an intersection to find a solution
122 // (this is called irregularly e.g. every 100 seconds)
123 private boolean useTreeTraversal = false;
124
125 // if this variable is set to true, then the refinement operator
126 // is applied until all concept of equal length have been found
127 // e.g. TOP -> A1 -> A2 -> A3 is found in one loop; the disadvantage
128 // are potentially more method calls, but the advantage is that
129 // the algorithm is better in locating relevant concept in the
130 // subsumption hierarchy (otherwise, if the most general concept
131 // is not promising, it may never get expanded)
132 private boolean forceRefinementLengthIncrease;
133
134 // candidate reduction: using this mechanism we can simulate
135 // the divide&conquer approach in many ILP programs using a
136 // clause by clause search; after a period of time the candidate
137 // set is reduced to focus CPU time on the most promising concepts
138 private boolean useCandidateReduction = true;
139 private int candidatePostReductionSize = 30;
140
141 // setting to true gracefully stops the algorithm
142 private boolean stop = false;
143 private boolean isRunning = false;
144
145 // node from which algorithm has started
146 private ExampleBasedNode startNode;
147
148 // solution protocol
149 private List<ExampleBasedNode> solutions = new LinkedList<ExampleBasedNode>();
150
151 // used refinement operator and heuristic (exchangeable)
152 private RhoDRDown operator;
153 // private RefinementOperator operator;
154 // private ExampleBasedHeuristic heuristic;
155
156 // specifies whether to compute and log benchmark information
157 private boolean computeBenchmarkInformation = false;
158
159 // comparator used to maintain a stable ordering of nodes, i.e.
160 // an ordering which does not change during the run of the algorithm
161 private NodeComparatorStable nodeComparatorStable = new NodeComparatorStable();
162 // stable candidate set; it has no functional part in the algorithm,
163 // but is a list of the currently best concepts found;
164 // it is very important to use a concurrent set here as other threads will
165 // access it (usual iterating is likely to throw a ConcurrentModificationException)
166 private NavigableSet<ExampleBasedNode> candidatesStable = new ConcurrentSkipListSet<ExampleBasedNode>(
167 nodeComparatorStable);
168 // evaluated descriptions
169 // private EvaluatedDescriptionSet evaluatedDescriptions = new EvaluatedDescriptionSet(LearningAlgorithm.MAX_NR_OF_RESULTS);
170
171 // comparator used to create ordered sets of concepts
172 private ConceptComparator conceptComparator = new ConceptComparator();
173 // comparator for evaluated descriptions
174 private EvaluatedDescriptionPosNegComparator edComparator = new EvaluatedDescriptionPosNegComparator();
175
176 // utility variables
177 private DecimalFormat df = new DecimalFormat();
178
179 // candidates for refinement (used for directly accessing
180 // nodes in the search tree)
181 private TreeSet<ExampleBasedNode> candidates;
182
183 // new nodes found during a run of the algorithm
184 private List<ExampleBasedNode> newCandidates = new LinkedList<ExampleBasedNode>();
185
186 // all concepts which have been evaluated as being proper refinements
187 private SortedSet<Description> properRefinements = new TreeSet<Description>(conceptComparator);
188
189 // blacklists
190 private SortedSet<Description> tooWeakList = new TreeSet<Description>(conceptComparator);
191 private SortedSet<Description> overlyGeneralList = new TreeSet<Description>(conceptComparator);
192
193 // set of expanded nodes (TODO: better explanation)
194 TreeSet<ExampleBasedNode> expandedNodes = new TreeSet<ExampleBasedNode>(nodeComparatorStable);
195
196 // statistic variables
197 private int maxRecDepth = 0;
198 private int maxNrOfRefinements = 0;
199 private int maxNrOfChildren = 0;
200 private int redundantConcepts = 0;
201 private int propernessTestsReasoner = 0;
202 private int propernessTestsAvoidedByShortConceptConstruction = 0;
203 private int propernessTestsAvoidedByTooWeakList = 0;
204 private int conceptTestsTooWeakList = 0;
205 private int conceptTestsOverlyGeneralList = 0;
206 private int conceptTestsReasoner = 0;
207
208 // time variables
209 private long runtime;
210 private long algorithmStartTime;
211 private long propernessCalcTimeNs = 0;
212 private long propernessCalcReasoningTimeNs = 0;
213 private long childConceptsDeletionTimeNs = 0;
214 private long refinementCalcTimeNs = 0;
215 private long redundancyCheckTimeNs = 0;
216 private long evaluateSetCreationTimeNs = 0;
217 private long improperConceptsRemovalTimeNs = 0;
218
219 // prefixes
220 private String baseURI;
221 private Map<String, String> prefixes;
222
223 public ROLearner2(
224 ROLComponent2Configurator configurator,
225 LearningProblem learningProblem,
226 ReasonerComponent rs,
227 RefinementOperator operator,
228 ExampleBasedHeuristic heuristic,
229 Description startDescription,
230 // Set<AtomicConcept> allowedConcepts,
231 // Set<AtomicRole> allowedRoles,
232 double noise, boolean writeSearchTree, boolean replaceSearchTree, File searchTreeFile,
233 boolean useTooWeakList, boolean useOverlyGeneralList,
234 boolean useShortConceptConstruction, boolean usePropernessChecks,
235 int maxPosOnlyExpansion, int maxExecutionTimeInSeconds, int minExecutionTimeInSeconds,
236 int guaranteeXgoodDescriptions, int maxClassDescriptionTests, boolean forceRefinementLengthIncrease) {
237
238
239 PosNegLP lp = (PosNegLP) learningProblem;
240 this.learningProblem = lp;
241 positiveExamples = lp.getPositiveExamples();
242 negativeExamples = lp.getNegativeExamples();
243 nrOfPositiveExamples = positiveExamples.size();
244 nrOfNegativeExamples = negativeExamples.size();
245
246 // System.out.println(nrOfPositiveExamples);
247 // System.out.println(nrOfNegativeExamples);
248 // System.exit(0);
249
250 this.configurator = configurator;
251 nrOfExamples = nrOfPositiveExamples + nrOfNegativeExamples;
252 this.rs = rs;
253 this.operator = (RhoDRDown) operator;
254 this.startDescription = startDescription;
255 // initialise candidate set with heuristic as ordering
256 candidates = new TreeSet<ExampleBasedNode>(heuristic);
257 this.noise = noise;
258 this.writeSearchTree = writeSearchTree;
259 this.replaceSearchTree = replaceSearchTree;
260 this.searchTreeFile = searchTreeFile;
261 this.useTooWeakList = useTooWeakList;
262 this.useOverlyGeneralList = useOverlyGeneralList;
263 this.useShortConceptConstruction = useShortConceptConstruction;
264 this.usePropernessChecks = usePropernessChecks;
265 baseURI = rs.getBaseURI();
266 prefixes = rs.getPrefixes();
267 this.maxPosOnlyExpansion = maxPosOnlyExpansion;
268 this.maxExecutionTimeInSeconds = maxExecutionTimeInSeconds;
269 this.minExecutionTimeInSeconds = minExecutionTimeInSeconds;
270 this.guaranteeXgoodDescriptions = guaranteeXgoodDescriptions;
271 this.maxClassDescriptionTests = maxClassDescriptionTests;
272 this.forceRefinementLengthIncrease = forceRefinementLengthIncrease;
273
274 // logger.setLevel(Level.DEBUG);
275 }
276
277 public void start() {
278 stop = false;
279 isRunning = true;
280 runtime = System.currentTimeMillis();
281
282 // reset values (algorithms may be started several times)
283 candidates.clear();
284 candidatesStable.clear();
285 newCandidates.clear();
286 solutions.clear();
287 maxExecutionTimeAlreadyReached = false;
288 minExecutionTimeAlreadyReached = false;
289 guaranteeXgoodAlreadyReached = false;
290 propernessTestsReasoner = 0;
291 propernessTestsAvoidedByShortConceptConstruction = 0;
292 propernessTestsAvoidedByTooWeakList = 0;
293 conceptTestsTooWeakList = 0;
294 conceptTestsOverlyGeneralList = 0;
295 propernessCalcTimeNs = 0;
296 propernessCalcReasoningTimeNs = 0;
297 childConceptsDeletionTimeNs = 0;
298 refinementCalcTimeNs = 0;
299 redundancyCheckTimeNs = 0;
300 evaluateSetCreationTimeNs = 0;
301 improperConceptsRemovalTimeNs = 0;
302
303 Monitor totalLearningTime = JamonMonitorLogger.getTimeMonitor(ROLComponent2.class, "totalLearningTime")
304 .start();
305 // TODO: write a JUnit test for this problem (long-lasting or infinite
306 // loops because
307 // redundant children of a node are called recursively when a node is
308 // extended
309 // twice)
310 /*
311 * // String conceptStr =
312 * "(\"http://dl-learner.org/carcinogenesis#Compound\" AND (>= 2
313 * \"http://dl-learner.org/carcinogenesis#hasStructure\".\"http://dl-learner.org/carcinogenesis#Ar_halide\"
314 * OR ((\"http://dl-learner.org/carcinogenesis#amesTestPositive\" IS
315 * TRUE) AND >= 5 \"http://dl-learner.org/carcinogenesis#hasBond\".
316 * TOP)))"; // String conceptStr =
317 * "(\"http://dl-learner.org/carcinogenesis#Compound\" AND
318 * ((\"http://dl-learner.org/carcinogenesis#amesTestPositive\" IS TRUE)
319 * AND (\"http://dl-learner.org/carcinogenesis#amesTestPositive\" IS
320 * TRUE)))"; String conceptStr =
321 * "(\"http://dl-learner.org/carcinogenesis#Compound\" AND (>= 3
322 * \"http://dl-learner.org/carcinogenesis#hasStructure\".\"http://dl-learner.org/carcinogenesis#Halide\"
323 * OR ((\"http://dl-learner.org/carcinogenesis#amesTestPositive\" IS
324 * TRUE) AND ALL
325 * \"http://dl-learner.org/carcinogenesis#hasAtom\".TOP)))"; String
326 * conceptStr2 = "(\"http://dl-learner.org/carcinogenesis#Compound\" AND
327 * (>= 4
328 * \"http://dl-learner.org/carcinogenesis#hasStructure\".\"http://dl-learner.org/carcinogenesis#Halide\"
329 * OR ((\"http://dl-learner.org/carcinogenesis#amesTestPositive\" IS
330 * TRUE) AND ALL
331 * \"http://dl-learner.org/carcinogenesis#hasAtom\".TOP)))"; try {
332 * NamedClass struc = new
333 * NamedClass("http://dl-learner.org/carcinogenesis#Compound");
334 * Description d = KBParser.parseConcept(conceptStr); Description d2 =
335 * KBParser.parseConcept(conceptStr2); // SortedSet<Description> ds =
336 * (SortedSet<Description>) operator.refine(d,15,null,struc); //
337 * System.out.println(ds);
338 * // System.out.println(RhoDRDown.checkIntersection((Intersection)d));
339 *
340 *
341 * Set<Individual> coveredNegatives = rs.instanceCheck(d,
342 * learningProblem.getNegativeExamples()); Set<Individual>
343 * coveredPositives = rs.instanceCheck(d,
344 * learningProblem.getPositiveExamples()); ExampleBasedNode ebn = new
345 * ExampleBasedNode(d); ebn.setCoveredExamples(coveredPositives,
346 * coveredNegatives);
347 *
348 * properRefinements.add(d2); extendNodeProper(ebn,13);
349 * extendNodeProper(ebn,14); for(Description refinement:
350 * ebn.getChildConcepts()) System.out.println("refinement: " +
351 * refinement);
352 * // Individual i = new
353 * Individual("http://dl-learner.org/carcinogenesis#d101"); //
354 * for(Individual i : learningProblem.getPositiveExamples()) //
355 * rs.instanceCheck(ds.last(), i);
356 *
357 * System.out.println("finished"); } catch (ParseException e) { // TODO
358 * Auto-generated catch block e.printStackTrace(); } System.exit(0);
359 */
360
361 // calculate quality threshold required for a solution
362 allowedMisclassifications = (int) Math.round(noise * nrOfExamples);
363
364 // start search with start class
365 if (startDescription == null) {
366 startNode = new ExampleBasedNode(configurator, Thing.instance);
367 startNode.setCoveredExamples(positiveExamples, negativeExamples);
368 } else {
369 startNode = new ExampleBasedNode(configurator, startDescription);
370 Set<Individual> coveredNegatives = rs.hasType(startDescription, negativeExamples);
371 Set<Individual> coveredPositives = rs.hasType(startDescription, positiveExamples);
372 startNode.setCoveredExamples(coveredPositives, coveredNegatives);
373 }
374
375 candidates.add(startNode);
376 candidatesStable.add(startNode);
377
378 ExampleBasedNode bestNode = startNode;
379 ExampleBasedNode bestNodeStable = startNode;
380
381 logger.info("starting top down refinement with: " + startNode.getConcept().toManchesterSyntaxString(baseURI, prefixes) + " (" + df.format(100*startNode.getAccuracy(nrOfPositiveExamples, nrOfNegativeExamples)) + "% accuracy)");
382
383 int loop = 0;
384
385 algorithmStartTime = System.nanoTime();
386 long lastPrintTime = 0;
387 long lastTreeTraversalTime = System.nanoTime();
388 long lastReductionTime = System.nanoTime();
389 // try a traversal after x seconds
390 long traversalInterval = 300l * 1000000000l;
391 long reductionInterval = 300l * 1000000000l;
392 long currentTime;
393
394 while (!isTerminationCriteriaReached()) {
395 //while ((!solutionFound || !configurator.getTerminateOnNoiseReached() ) && !stop) {
396
397 // print statistics at most once a second
398 currentTime = System.nanoTime();
399 if (currentTime - lastPrintTime > 3000000000l) {
400 printStatistics(false);
401 lastPrintTime = currentTime;
402 logger.debug("--- loop " + loop + " started ---");
403 // logger.info("used memory: " + (Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory())/1024);
404 // logger.info("test: " + ExampleBasedNode.exampleMemoryCounter/1024);
405 }
406
407 // traverse the current search tree to find a solution
408 if (useTreeTraversal && (currentTime - lastTreeTraversalTime > traversalInterval)) {
409 traverseTree();
410 lastTreeTraversalTime = System.nanoTime();
411 }
412
413 // reduce candidates to focus on promising concepts
414 if (useCandidateReduction && (currentTime - lastReductionTime > reductionInterval)) {
415 reduceCandidates();
416 lastReductionTime = System.nanoTime();
417 // Logger.getRootLogger().setLevel(Level.TRACE);
418 }
419
420 // System.out.println("next expanded: " +
421 // candidates.last().getShortDescription(nrOfPositiveExamples,
422 // nrOfNegativeExamples, baseURI));
423
424 // we record when a more accurate node is found and log it
425 if (bestNodeStable.getCovPosMinusCovNeg() < candidatesStable.last()
426 .getCovPosMinusCovNeg()) {
427 String acc = new DecimalFormat( ".00%" ).format((candidatesStable.last().getAccuracy(nrOfPositiveExamples, nrOfNegativeExamples)));
428 // no handling needed, it will just look ugly in the output
429 logger.info("more accurate ("+acc+") class expression found: " + candidatesStable.last().getConcept().toManchesterSyntaxString(baseURI, prefixes));
430 bestNodeStable = candidatesStable.last();
431 }
432
433 // chose best node according to heuristics
434 bestNode = candidates.last();
435 // extend best node
436 newCandidates.clear();
437 // best node is removed temporarily, because extending it can
438 // change its evaluation
439 candidates.remove(bestNode);
440 extendNodeProper(bestNode, bestNode.getHorizontalExpansion() + 1);
441 candidates.add(bestNode);
442 // newCandidates has been filled during node expansion
443 candidates.addAll(newCandidates);
444 candidatesStable.addAll(newCandidates);
445
446 // System.out.println("done");
447
448 if (writeSearchTree) {
449 // String treeString = "";
450 String treeString = "best node: " + bestNode + "\n";
451 if (expandedNodes.size() > 1) {
452 treeString += "all expanded nodes:\n";
453 for (ExampleBasedNode n : expandedNodes) {
454 treeString += " " + n + "\n";
455 }
456 }
457 expandedNodes.clear();
458 treeString += startNode.getTreeString(nrOfPositiveExamples, nrOfNegativeExamples,
459 baseURI);
460 treeString += "\n";
461
462 if (replaceSearchTree)
463 Files.createFile(searchTreeFile, treeString);
464 else
465 Files.appendFile(searchTreeFile, treeString);
466 }
467
468 // special situation for positive only learning: the expanded node
469 // can become a solution (see explanations
470 // for maxPosOnlyExpansion above)
471
472 // DEPRECATED CODE
473 if (false
474 && bestNode.isPosOnlyCandidate()
475 && (bestNode.getHorizontalExpansion() - bestNode.getConcept().getLength() >= maxPosOnlyExpansion)) {
476
477 boolean solution = checkSubtreePosOnly(bestNode);
478
479 if (solution) {
480 solutions.add(bestNode);
481 ExampleBasedNode bestChild = bestNode.getChildren().last();
482 System.out.println("solution: " + bestNode.getConcept());
483 System.out.println("maxPosOnlyExpansion: " + maxPosOnlyExpansion);
484 System.out.println("best child of this node: " + bestChild);
485 if(bestNode.getChildConcepts().size()<100) {
486 System.out.println(bestNode.getChildConcepts());
487 }
488 System.out.println("TODO: needs to be integrated with other stopping criteria");
489 System.out
490 .println("You tried to use this algorithm for positive only learning, which is not recommended (yet).");
491 // System.out.println("Exiting.");
492 // System.exit(0);
493 } else {
494 // tag as non-candidate so we do not need to search again
495 bestNode.setPosOnlyCandidate(false);
496 }
497
498 }
499
500 // Anzahl Schleifendurchläufe
501 loop++;
502 }// end while
503
504 if (solutions.size()>0) {
505 //if (solutionFound) {
506 int solutionLimit = 20;
507 // we do not need to print the best node if we display the top 20 solutions below anyway
508 // logger.info("best node "
509 // + candidatesStable.last().getShortDescription(nrOfPositiveExamples,
510 // nrOfNegativeExamples, baseURI));
511 logger.info("solutions (at most " + solutionLimit + " are shown):");
512 int show = 1;
513 String manchester = "MANCHESTER:\n";
514 String kbSyntax = "KBSyntax:\n";
515 for (ExampleBasedNode c : solutions) {
516 logger.info(show + ": " + c.getConcept().toManchesterSyntaxString(baseURI, prefixes) + " (accuracy " + df.format(100*c.getAccuracy(nrOfPositiveExamples, nrOfNegativeExamples)) + "%, length " + c.getConcept().getLength()
517 + ", depth " + c.getConcept().getDepth() + ")");
518 // manchester += show + ": " + c.toManchesterSyntaxString(baseURI, prefixes) + "\n";
519 kbSyntax += show + ": " + c.getConcept().toKBSyntaxString() + "\n";
520 if (show >= solutionLimit) {
521 break;
522 }
523 show++;
524 }
525 logger.debug(manchester);
526 logger.debug(kbSyntax);
527 }
528
529 logger.debug("size of candidate set: " + candidates.size());
530 boolean showOrderedSolutions = false;
531 printBestSolutions(20, showOrderedSolutions);
532
533 printStatistics(true);
534
535 int conceptTests = conceptTestsReasoner + conceptTestsTooWeakList + conceptTestsOverlyGeneralList;
536 if (stop) {
537 logger.info("Algorithm stopped ("+conceptTests+" descriptions tested).\n");
538 } else {
539 logger.info("Algorithm terminated succesfully ("+conceptTests+" descriptions tested).\n");
540 }
541
542 totalLearningTime.stop();
543 isRunning = false;
544 }
545
546 // we apply the operator recursively until all proper refinements up
547 // to the maxmimum length are reached
548 private void extendNodeProper(ExampleBasedNode node, int maxLength) {
549 long propCalcNsStart = System.nanoTime();
550
551 if (writeSearchTree)
552 expandedNodes.add(node);
553
554 if (node.getChildren().size() > maxNrOfChildren)
555 maxNrOfChildren = node.getChildren().size();
556
557 extendNodeProper(node, node.getConcept(), maxLength, 0);
558 node.setHorizontalExpansion(maxLength);
559
560 propernessCalcTimeNs += (System.nanoTime() - propCalcNsStart);
561 }
562
563 // for all refinements of concept up to max length, we check whether they
564 // are properr
565 // and call the method recursively if not
566 // recDepth is used only for statistics
567 private void extendNodeProper(ExampleBasedNode node, Description concept, int maxLength,
568 int recDepth) {
569
570 // System.out.println("node: " + node);
571 // System.out.println("concept: " + concept);
572
573 // do not execute methods if algorithm has been stopped (this means that
574 // the algorithm
575 // will terminate without further reasoning queries)
576 if (stop)
577 return;
578
579 if (recDepth > maxRecDepth)
580 maxRecDepth = recDepth;
581
582 // compute refinements => we must not delete refinements with low
583 // horizontal expansion,
584 // because they are used in recursive calls of this method later on
585 long refinementCalcTimeNsStart = System.nanoTime();
586 Set<Description> refinements = operator.refine(concept, maxLength, null);
587 refinementCalcTimeNs += System.nanoTime() - refinementCalcTimeNsStart;
588
589 if (refinements.size() > maxNrOfRefinements)
590 maxNrOfRefinements = refinements.size();
591
592 long childConceptsDeletionTimeNsStart = System.nanoTime();
593 // entferne aus den refinements alle Konzepte, die bereits Kinder des
594 // Knotens sind
595 // for(Node n : node.getChildren()) {
596 // refinements.remove(n.getConcept());
597 // }
598
599 // das ist viel schneller, allerdings bekommt man ein anderes candidate
600 // set(??)
601 refinements.removeAll(node.getChildConcepts());
602
603 childConceptsDeletionTimeNs += System.nanoTime() - childConceptsDeletionTimeNsStart;
604
605 // if(refinements.size()<30) {
606 // // System.out.println("refinements: " + refinements);
607 // for(Description refinement: refinements)
608 // System.out.println("refinement: " + refinement);
609 // }
610
611 long evaluateSetCreationTimeNsStart = System.nanoTime();
612
613 // alle Konzepte, die länger als horizontal expansion sind, müssen
614 // ausgewertet
615 // werden
616 TreeSet<Description> toEvaluateConcepts = new TreeSet<Description>(conceptComparator);
617 Iterator<Description> it = refinements.iterator();
618 // for(Concept refinement : refinements) {
619 while (it.hasNext()) {
620
621 Description refinement = it.next();
622 if (refinement.getLength() > node.getHorizontalExpansion()) {
623 // sagt aus, ob festgestellt wurde, ob refinement proper ist
624 // (sagt nicht aus, dass das refinement proper ist!)
625 boolean propernessDetected = false;
626
627 // 1. short concept construction
628 if (useShortConceptConstruction) {
629 // kurzes Konzept konstruieren
630 Description shortConcept = ConceptTransformation.getShortConcept(refinement,
631 conceptComparator);
632 int n = conceptComparator.compare(shortConcept, concept);
633
634 // Konzepte sind gleich also Refinement improper
635 if (n == 0) {
636 propernessTestsAvoidedByShortConceptConstruction++;
637 propernessDetected = true;
638
639 // System.out.println("refinement " + refinement +
640 // " can be shortened");
641 // System.exit(0);
642 }
643 }
644
645 // 2. too weak test
646 if (!propernessDetected && useTooWeakList) {
647 if (refinement instanceof Intersection) {
648 boolean tooWeakElement = containsTooWeakElement((Intersection) refinement);
649 if (tooWeakElement) {
650 propernessTestsAvoidedByTooWeakList++;
651 conceptTestsTooWeakList++;
652 propernessDetected = true;
653 // tooWeakList.add(refinement);
654
655 // Knoten wird direkt erzeugt (es ist buganfällig
656 // zwei Plätze
657 // zu haben, an denen Knoten erzeugt werden, aber es
658 // erscheint
659 // hier am sinnvollsten)
660 properRefinements.add(refinement);
661 tooWeakList.add(refinement);
662
663 ExampleBasedNode newNode = new ExampleBasedNode(configurator, refinement);
664 newNode.setHorizontalExpansion(refinement.getLength() - 1);
665 newNode.setTooWeak(true);
666 newNode
667 .setQualityEvaluationMethod(ExampleBasedNode.QualityEvaluationMethod.TOO_WEAK_LIST);
668 node.addChild(newNode);
669
670 // Refinement muss gelöscht werden, da es proper ist
671 it.remove();
672 }
673 }
674 }
675
676 // properness konnte nicht vorher ermittelt werden
677 if (!propernessDetected) {
678 toEvaluateConcepts.add(refinement);
679 // if(!res) {
680 // System.out.println("already in: " + refinement);
681 // Comparator comp = toEvaluateConcepts.comparator();
682 // for(Description d : toEvaluateConcepts) {
683 // if(comp.compare(d,refinement)==0)
684 // System.out.println("see: " + d);
685 // }
686 // }
687 }
688
689 }
690
691 // System.out.println("handled " + refinement + " length: " +
692 // refinement.getLength() + " (new size: " +
693 // toEvaluateConcepts.size() + ")");
694
695 }
696 evaluateSetCreationTimeNs += System.nanoTime() - evaluateSetCreationTimeNsStart;
697
698 // System.out.println("intermediate 1 " +
699 // node.getShortDescription(nrOfPositiveExamples, nrOfNegativeExamples,
700 // baseURI));
701
702 // System.out.println(toEvaluateConcepts.size());
703
704 Set<Description> improperConcepts = null;
705 if (toEvaluateConcepts.size() > 0) {
706 // Test aller Konzepte auf properness (mit DIG in nur einer Anfrage)
707 if (usePropernessChecks) {
708 long propCalcReasoningStart = System.nanoTime();
709 improperConcepts = rs.isSuperClassOf(toEvaluateConcepts, concept);
710 propernessTestsReasoner += toEvaluateConcepts.size();
711 // boolean isProper =
712 // !learningProblem.getReasonerComponent().subsumes(refinement,
713 // concept);
714 propernessCalcReasoningTimeNs += System.nanoTime() - propCalcReasoningStart;
715 }
716 }
717
718 // if(toEvaluateConcepts.size()<10)
719 // System.out.println("to evaluate: " + toEvaluateConcepts);
720 // else
721 // System.out.println("to evaluate: more than 10");
722
723 long improperConceptsRemovalTimeNsStart = System.nanoTime();
724 // die improper Konzepte werden von den auszuwertenden gelöscht, d.h.
725 // alle proper concepts bleiben übrig (einfache Umbenennung)
726 if (improperConcepts != null)
727 toEvaluateConcepts.removeAll(improperConcepts);
728 Set<Description> properConcepts = toEvaluateConcepts;
729 // alle proper concepts von refinements löschen
730 refinements.removeAll(properConcepts);
731 improperConceptsRemovalTimeNs += System.nanoTime() - improperConceptsRemovalTimeNsStart;
732
733 // if(refinements.size()<10)
734 // System.out.println("refinements: " + refinements);
735 // else
736 // System.out.println("refinements: more than 10");
737 //
738 // System.out.println("improper concepts: " + improperConcepts);
739
740 for (Description refinement : properConcepts) {
741 long redundancyCheckTimeNsStart = System.nanoTime();
742 boolean nonRedundant = properRefinements.add(refinement);
743 redundancyCheckTimeNs += System.nanoTime() - redundancyCheckTimeNsStart;
744
745 if (!nonRedundant)
746 redundantConcepts++;
747
748 // es wird nur ein neuer Knoten erzeugt, falls das Konzept nicht
749 // schon existiert
750 if (nonRedundant) {
751
752 // newly created node
753 ExampleBasedNode newNode = new ExampleBasedNode(configurator, refinement);
754 // die -1 ist wichtig, da sonst keine gleich langen Refinements
755 // für den neuen Knoten erlaubt wären z.B. person => male
756 newNode.setHorizontalExpansion(refinement.getLength() - 1);
757
758 boolean qualityKnown = false;
759 int quality = -2;
760
761 // overly general list verwenden
762 if (useOverlyGeneralList && refinement instanceof Union) {
763 if (containsOverlyGeneralElement((Union) refinement)) {
764 conceptTestsOverlyGeneralList++;
765 // quality = getNumberOfNegatives();
766 quality = nrOfNegativeExamples;
767 qualityKnown = true;
768 newNode
769 .setQualityEvaluationMethod(ExampleBasedNode.QualityEvaluationMethod.OVERLY_GENERAL_LIST);
770 newNode.setCoveredExamples(positiveExamples, negativeExamples);
771 }
772
773 }
774
775 // Qualität des Knotens auswerten
776 if (!qualityKnown) {
777 long propCalcReasoningStart2 = System.nanoTime();
778 conceptTestsReasoner++;
779
780 // quality = coveredNegativesOrTooWeak(refinement);
781
782 // determine individuals which have not been covered yet
783 // (more efficient than full retrieval)
784 Set<Individual> coveredPositives = node.getCoveredPositives();
785 Set<Individual> newlyCoveredPositives = new HashSet<Individual>();
786
787 // calculate how many pos. examples are not covered by the
788 // parent node of the refinement
789 int misclassifiedPositives = nrOfPositiveExamples - coveredPositives.size();
790
791 // iterate through all covered examples (examples which are
792 // not
793 // covered do not need to be tested, because they remain
794 // uncovered);
795 // DIG will be slow if we send each reasoner request
796 // individually
797 // (however if we send everything in one request, too many
798 // instance checks
799 // are performed => rely on fast instance checker)
800 for (Individual i : coveredPositives) {
801 // TODO: move code to a separate function
802 if (quality != -1) {
803 boolean covered = rs.hasType(refinement, i);
804 if (!covered)
805 misclassifiedPositives++;
806 else
807 newlyCoveredPositives.add(i);
808
809 if (misclassifiedPositives > allowedMisclassifications)
810 quality = -1;
811
812 }
813 }
814
815 Set<Individual> newlyCoveredNegatives = null;
816 if (quality != -1) {
817 Set<Individual> coveredNegatives = node.getCoveredNegatives();
818 newlyCoveredNegatives = new HashSet<Individual>();
819
820 for (Individual i : coveredNegatives) {
821 boolean covered = rs.hasType(refinement, i);
822 if (covered)
823 newlyCoveredNegatives.add(i);
824 }
825 }
826
827 propernessCalcReasoningTimeNs += System.nanoTime() - propCalcReasoningStart2;
828 newNode
829 .setQualityEvaluationMethod(ExampleBasedNode.QualityEvaluationMethod.REASONER);
830 if (quality != -1) {
831 // quality is the number of misclassifications (if it is
832 // not too weak)
833 quality = (nrOfPositiveExamples - newlyCoveredPositives.size())
834 + newlyCoveredNegatives.size();
835 newNode.setCoveredExamples(newlyCoveredPositives, newlyCoveredNegatives);
836 }
837
838 }
839
840 if (quality == -1) {
841 newNode.setTooWeak(true);
842 // Blacklist für too weak concepts
843 tooWeakList.add(refinement);
844 } else {
845 // Lösung gefunden
846 if (quality >= 0 && quality <= allowedMisclassifications) {
847 solutions.add(newNode);
848 }
849
850 newCandidates.add(newNode);
851
852 // we need to make sure that all positives are covered
853 // before adding something to the overly general list
854 if ((newNode.getCoveredPositives().size() == nrOfPositiveExamples)
855 && quality == nrOfNegativeExamples)
856 overlyGeneralList.add(refinement);
857
858 }
859
860 // System.out.println(newNode.getConcept() + " " + quality);
861 node.addChild(newNode);
862
863 // it is often useful to continue expanding until a longer node is
864 // reached (to replace atomic concepts with more specific ones)
865 if(forceRefinementLengthIncrease && !newNode.isTooWeak()) {
866 // extend node again if its concept has the same length
867 if(node.getConcept().getLength() == newNode.getConcept().getLength()) {
868 extendNodeProper(newNode, refinement, maxLength, recDepth + 1);
869 }
870 }
871
872 }
873 }
874
875 // es sind jetzt noch alle Konzepte übrig, die improper refinements sind
876 // auf jedem dieser Konzepte wird die Funktion erneut aufgerufen, da
877 // sich
878 // proper refinements ergeben könnten
879 for (Description refinement : refinements) {
880 // for(int i=0; i<=recDepth; i++)
881 // System.out.print(" ");
882 // System.out.println("call: " + refinement + " [maxLength " +
883 // maxLength + ", rec depth " + recDepth + "]");
884
885 // check for redundancy (otherwise we may run into very
886 // time-intensive loops,
887 // see planned JUnit test case $x)
888
889 long redundancyCheckTimeNsStart = System.nanoTime();
890 boolean redundant = properRefinements.contains(refinement);
891 redundancyCheckTimeNs += System.nanoTime() - redundancyCheckTimeNsStart;
892
893 if (!redundant) {
894 // System.out.println("node " + node);
895 // System.out.println("refinement " + refinement);
896
897 extendNodeProper(node, refinement, maxLength, recDepth + 1);
898 }
899
900 // for(int i=0; i<=recDepth; i++)
901 // System.out.print(" ");
902 // System.out.println("finished: " + refinement + " [maxLength " +
903 // maxLength + "]");
904 // System.exit(0);
905 }
906 }
907
908 private void printStatistics(boolean finalStats) {
909 // TODO: viele Tests haben ergeben, dass man nie 100% mit der
910 // Zeitmessung abdecken
911 // kann (zum einen weil Stringausgabe verzögert erfolgt und zum anderen
912 // weil
913 // Funktionsaufrufe, garbage collection, Zeitmessung selbst auch Zeit
914 // benötigt);
915 // es empfiehlt sich folgendes Vorgehen:
916 // - Messung der Zeit eines Loops im Algorithmus
917 // - Messung der Zeit für alle node extensions innerhalb eines Loops
918 // => als Normalisierungsbasis empfehlen sich dann die Loopzeit statt
919 // Algorithmuslaufzeit
920 // ... momentan kann es aber auch erstmal so lassen
921
922 long algorithmRuntime = System.nanoTime() - algorithmStartTime;
923
924 if (!finalStats) {
925
926 ExampleBasedNode bestNode = candidatesStable.last();
927 // double accuracy = 100 * ((bestNode.getCoveredPositives().size()
928 // + nrOfNegativeExamples -
929 // bestNode.getCoveredNegatives().size())/(double)nrOfExamples);
930 // Refinementoperator auf Konzept anwenden
931 // String bestNodeString = "currently best node: " + bestNode + "
932 // accuracy: " + df.format(accuracy) + "%";
933 logger.debug("start node: "
934 + startNode.getShortDescription(nrOfPositiveExamples, nrOfNegativeExamples,
935 baseURI));
936 String bestNodeString = "currently best node: "
937 + bestNode.getShortDescription(nrOfPositiveExamples, nrOfNegativeExamples,
938 baseURI);
939 String bestNodeStringKBSyntax = "currently best node KBSyntax: "
940 + bestNode.getConcept().toKBSyntaxString();
941
942 // searchTree += bestNodeString + "\n";
943 logger.debug(bestNodeString);
944 logger.trace(bestNode.getStats(nrOfPositiveExamples, nrOfNegativeExamples));
945 logger.debug(bestNodeStringKBSyntax);
946 if (bestNode.getCoveredNegatives().size() <= 5)
947 logger.trace("covered negs: " + bestNode.getCoveredNegatives());
948 String expandedNodeString = "next expanded node: "
949 + candidates.last().getShortDescription(nrOfPositiveExamples,
950 nrOfNegativeExamples, baseURI);
951 // searchTree += expandedNodeString + "\n";
952 logger.debug(expandedNodeString);
953 logger.debug("algorithm runtime " + Helper.prettyPrintNanoSeconds(algorithmRuntime));
954 logger.debug("size of candidate set: " + candidates.size());
955 // System.out.println("properness max recursion depth: " +
956 // maxRecDepth);
957 // System.out.println("max. number of one-step refinements: " +
958 // maxNrOfRefinements);
959 // System.out.println("max. number of children of a node: " +
960 // maxNrOfChildren);
961 logger.debug("subsumption time: "
962 + Helper.prettyPrintNanoSeconds(rs.getSubsumptionReasoningTimeNs()));
963 logger.debug("instance check time: "
964 + Helper.prettyPrintNanoSeconds(rs.getInstanceCheckReasoningTimeNs()));
965 logger.debug("retrieval time: "
966 + Helper.prettyPrintNanoSeconds(rs.getRetrievalReasoningTimeNs()));
967 }
968
969 if (computeBenchmarkInformation) {
970
971 long reasoningTime = rs.getOverallReasoningTimeNs();
972 double reasoningPercentage = 100 * reasoningTime / (double) algorithmRuntime;
973 long propWithoutReasoning = propernessCalcTimeNs - propernessCalcReasoningTimeNs;
974 double propPercentage = 100 * propWithoutReasoning / (double) algorithmRuntime;
975 double deletionPercentage = 100 * childConceptsDeletionTimeNs
976 / (double) algorithmRuntime;
977 long subTime = rs.getSubsumptionReasoningTimeNs();
978 double subPercentage = 100 * subTime / (double) algorithmRuntime;
979 double refinementPercentage = 100 * refinementCalcTimeNs / (double) algorithmRuntime;
980 double redundancyCheckPercentage = 100 * redundancyCheckTimeNs
981 / (double) algorithmRuntime;
982 double evaluateSetCreationTimePercentage = 100 * evaluateSetCreationTimeNs
983 / (double) algorithmRuntime;
984 double improperConceptsRemovalTimePercentage = 100 * improperConceptsRemovalTimeNs
985 / (double) algorithmRuntime;
986 double mComputationTimePercentage = 100 * operator.mComputationTimeNs
987 / (double) algorithmRuntime;
988 double topComputationTimePercentage = 100 * operator.topComputationTimeNs
989 / (double) algorithmRuntime;
990 double cleanTimePercentage = 100 * ConceptTransformation.cleaningTimeNs
991 / (double) algorithmRuntime;
992 double onnfTimePercentage = 100 * ConceptTransformation.onnfTimeNs
993 / (double) algorithmRuntime;
994 double shorteningTimePercentage = 100 * ConceptTransformation.shorteningTimeNs
995 / (double) algorithmRuntime;
996
997 logger.debug("reasoning percentage: " + df.format(reasoningPercentage) + "%");
998 logger.debug(" subsumption check time: " + df.format(subPercentage) + "%");
999 logger.debug("proper calculation percentage (wo. reasoning): "
1000 + df.format(propPercentage) + "%");
1001 logger.debug(" deletion time percentage: " + df.format(deletionPercentage) + "%");
1002 logger.debug(" refinement calculation percentage: " + df.format(refinementPercentage)
1003 + "%");
1004 logger.debug(" m calculation percentage: " + df.format(mComputationTimePercentage)
1005 + "%");
1006 logger.debug(" top calculation percentage: "
1007 + df.format(topComputationTimePercentage) + "%");
1008 logger.debug(" redundancy check percentage: " + df.format(redundancyCheckPercentage)
1009 + "%");
1010 logger.debug(" evaluate set creation time percentage: "
1011 + df.format(evaluateSetCreationTimePercentage) + "%");
1012 logger.debug(" improper concepts removal time percentage: "
1013 + df.format(improperConceptsRemovalTimePercentage) + "%");
1014 logger.debug("clean time percentage: " + df.format(cleanTimePercentage) + "%");
1015 logger.debug("onnf time percentage: " + df.format(onnfTimePercentage) + "%");
1016 logger
1017 .debug("shortening time percentage: " + df.format(shorteningTimePercentage)
1018 + "%");
1019 }
1020
1021 logger.debug("properness tests (reasoner/short concept/too weak list): "
1022 + propernessTestsReasoner + "/" + propernessTestsAvoidedByShortConceptConstruction
1023 + "/" + propernessTestsAvoidedByTooWeakList);
1024 logger
1025 .debug("concept tests (reasoner/too weak list/overly general list/redundant concepts): "
1026 + conceptTestsReasoner
1027 + "/"
1028 + conceptTestsTooWeakList
1029 + "/"
1030 + conceptTestsOverlyGeneralList + "/" + redundantConcepts);
1031 }
1032
1033 // @SuppressWarnings({"unused"})
1034 // private int coveredNegativesOrTooWeak(Description concept) {
1035 // if(posOnly)
1036 // return
1037 // posOnlyLearningProblem.coveredPseudoNegativeExamplesOrTooWeak(concept);
1038 // else
1039 // return learningProblem.coveredNegativeExamplesOrTooWeak(concept);
1040 // }
1041
1042 // private int getNumberOfNegatives() {
1043 // if(posOnly)
1044 // return posOnlyLearningProblem.getPseudoNegatives().size();
1045 // else
1046 // return learningProblem.getNegativeExamples().size();
1047 // }
1048
1049 private boolean containsTooWeakElement(Intersection mc) {
1050 for (Description child : mc.getChildren()) {
1051 if (tooWeakList.contains(child))
1052 return true;
1053 }
1054 return false;
1055 }
1056
1057 private boolean containsOverlyGeneralElement(Union md) {
1058 for (Description child : md.getChildren()) {
1059 if (overlyGeneralList.contains(child))
1060 return true;
1061 }
1062 return false;
1063 }
1064
1065 // TODO: investigate whether it makes sense not to store all individuals
1066 // in the nodes, but instead perform instance checks in tree traversal
1067 // (it is only run in large intervals so it shouldn't be too expensive)
1068 private void traverseTree() {
1069 // ExampleBasedNode startNode = candidatesStable.last();
1070 ExampleBasedNode startNode = findBestTraversalStartNode();
1071 Description currentDescription = startNode.getConcept();
1072 Set<Individual> currentCoveredPos = startNode.getCoveredPositives();
1073 Set<Individual> currentCoveredNeg = startNode.getCoveredNegatives();
1074 double currentAccuracy = startNode.getAccuracy(nrOfPositiveExamples, nrOfNegativeExamples);
1075 int currentMisclassifications = nrOfPositiveExamples - currentCoveredPos.size()
1076 + currentCoveredNeg.size();
1077 logger.debug("tree traversal start node "
1078 + startNode
1079 .getShortDescription(nrOfPositiveExamples, nrOfNegativeExamples, baseURI));
1080 logger.debug("tree traversal start accuracy: " + currentAccuracy);
1081 int i = 0;
1082 // start from the most promising nodes
1083 NavigableSet<ExampleBasedNode> reverseView = candidatesStable.descendingSet();
1084 for (ExampleBasedNode currNode : reverseView) {
1085 // compute covered positives and negatives
1086 SortedSet<Individual> newCoveredPositives = new TreeSet<Individual>(currentCoveredPos);
1087 newCoveredPositives.retainAll(currNode.getCoveredPositives());
1088 SortedSet<Individual> newCoveredNegatives = new TreeSet<Individual>(currentCoveredNeg);
1089 newCoveredNegatives.retainAll(currNode.getCoveredNegatives());
1090
1091 // compute the accuracy we would get by adding this node
1092 double accuracy = (newCoveredPositives.size() + nrOfNegativeExamples - newCoveredNegatives
1093 .size())
1094 / (double) (nrOfPositiveExamples + nrOfNegativeExamples);
1095 int misclassifications = nrOfPositiveExamples - newCoveredPositives.size()
1096 + newCoveredNegatives.size();
1097 int misclassifiedPositives = nrOfPositiveExamples - newCoveredPositives.size();
1098
1099 int lostPositives = currentCoveredPos.size() - newCoveredPositives.size();
1100
1101 // TODO: maybe we should also consider a minimum improvement when
1102 // adding something
1103 // otherwise we could overfit
1104 // we give double weith to lost positives, i.e. when one positive is
1105 // lost at least
1106 // two negatives need to be uncovered
1107 boolean consider = (misclassifications + lostPositives < currentMisclassifications)
1108 && (misclassifiedPositives <= allowedMisclassifications);
1109 // boolean consider = (misclassifications <
1110 // currentMisclassifications) &&
1111 // (misclassifiedPositives <= allowedMisclassifications);
1112
1113 // concept has been chosen, so construct it
1114 if (consider) {
1115
1116 // construct a new concept as intersection of both
1117 Intersection mc = new Intersection(currentDescription, currNode.getConcept());
1118
1119 ConceptTransformation.cleanConceptNonRecursive(mc);
1120 ConceptTransformation.transformToOrderedNegationNormalFormNonRecursive(mc,
1121 conceptComparator);
1122
1123 // System.out.println("extended concept to: " + mc);
1124 logger.debug("misclassifications: " + misclassifications);
1125 logger.debug("misclassified positives: " + misclassifiedPositives);
1126 logger.debug("accuracy: " + accuracy);
1127
1128 // update variables
1129 currentDescription = mc;
1130 currentCoveredPos = newCoveredPositives;
1131 currentCoveredNeg = newCoveredNegatives;
1132 currentMisclassifications = misclassifications;
1133 currentAccuracy = accuracy;
1134
1135 if (accuracy > 1 - noise) {
1136 logger.info("traversal found " + mc);
1137 logger.info("accuracy: " + accuracy);
1138 System.exit(0);
1139 }
1140 }
1141
1142 i++;
1143 if (i == 1000)
1144 break;
1145 }
1146
1147 }
1148
1149 // we look for a node covering many positives and hopefully
1150 // few negatives; we give a strong penalty on uncovered positives
1151 private ExampleBasedNode findBestTraversalStartNode() {
1152 // 2 points for each covered pos + 1 point for each uncovered neg
1153 int currScore = 0;
1154 int i = 0;
1155 ExampleBasedNode currNode = null;
1156 NavigableSet<ExampleBasedNode> reverseView = candidatesStable.descendingSet();
1157 for (ExampleBasedNode node : reverseView) {
1158 int score = 2 * node.getCoveredPositives().size()
1159 + (nrOfNegativeExamples - node.getCoveredNegatives().size());
1160 if (score > currScore) {
1161 currScore = score;
1162 currNode = node;
1163 }
1164 i++;
1165 // limit search because stable candidate set can grow very large
1166 if (i == 10000)
1167 break;
1168 }
1169 return currNode;
1170 }
1171
1172 private void reduceCandidates() {
1173 Iterator<ExampleBasedNode> it = candidatesStable.descendingIterator();
1174 Set<ExampleBasedNode> promisingNodes = new HashSet<ExampleBasedNode>();
1175 int i = 0;
1176 while (it.hasNext() && promisingNodes.size() < candidatePostReductionSize) {
1177 ExampleBasedNode node = it.next();
1178 // System.out.println(node.getShortDescription(nrOfPositiveExamples,
1179 // nrOfNegativeExamples, baseURI));
1180 // first criterion: the considered node should have an accuracy gain
1181 // over its parent
1182 // (avoids to use only the most promising node + all its refinements
1183 // with equal accuracy)
1184 boolean hasAccuracyGain = (node.getParent() == null)
1185 || (node.getCoveredPositives().size() != node.getParent().getCoveredPositives()
1186 .size())
1187 || (node.getCoveredNegatives().size() != node.getParent().getCoveredNegatives()
1188 .size());
1189 // second criterion: uncovered positives; it does not make much
1190 // sense to pick nodes with
1191 // low potential for reaching a solution (already at the limit of
1192 // misclassified positives)
1193 int misclassifiedPositives = nrOfPositiveExamples - node.getCoveredPositives().size();
1194 boolean hasRefinementPotential = (misclassifiedPositives <= Math
1195 .floor(0.65d * allowedMisclassifications));
1196 boolean keep = hasAccuracyGain && hasRefinementPotential;
1197 if (keep) {
1198 promisingNodes.add(node);
1199 }
1200 i++;
1201 }
1202 candidates.retainAll(promisingNodes);
1203 logger.debug("searched " + i + " nodes and picked the following promising descriptions:");
1204 for (ExampleBasedNode node : promisingNodes)
1205 logger.debug(node.getShortDescription(nrOfPositiveExamples, nrOfNegativeExamples,
1206 baseURI));
1207 }
1208
1209 /*
1210 * private Set<Individual> computeQuality(Description refinement, Set<Individual>
1211 * coveredPositives) { Set<Individual> ret = new TreeSet<Individual>();
1212 * int misclassifications; for(Individual i : coveredPositives) { boolean
1213 * covered = rs.instanceCheck(refinement, i); if(!covered)
1214 * misclassifications++; else ret.add(i);
1215 *
1216 * if(misclassifications > allowedMisclassifications) return null; } }
1217 */
1218
1219 public void stop() {
1220 stop = true;
1221 }
1222
1223 public Description getBestSolution() {
1224 return candidatesStable.last().getConcept();
1225 }
1226
1227 public List<Description> getCurrentlyBestDescriptions() {
1228 List<Description> best = new LinkedList<Description>();
1229 int i = 0;
1230 int nrOfSolutions = 200;
1231 for (ExampleBasedNode n : candidatesStable.descendingSet()) {
1232 best.add(n.getConcept());
1233 if (i == nrOfSolutions)
1234 return best;
1235 i++;
1236 }
1237 return best;
1238 }
1239
1240 public SortedSet<EvaluatedDescriptionPosNeg> getCurrentlyBestEvaluatedDescriptions() {
1241 Iterator<ExampleBasedNode> it = candidatesStable.descendingIterator();
1242 int count = 0;
1243 SortedSet<EvaluatedDescriptionPosNeg> cbd = new TreeSet<EvaluatedDescriptionPosNeg>(edComparator);
1244 while(it.hasNext()) {
1245 ExampleBasedNode eb = it.next();
1246 cbd.add(new EvaluatedDescriptionPosNeg(eb.getConcept(), getScore(eb.getConcept())));
1247 // return a maximum of 200 elements (we need a maximum, because the
1248 // candidate set can be very large)
1249 if (count > 200)
1250 return cbd;
1251 count++;
1252 }
1253 return cbd;
1254 }
1255
1256 public void printBestSolutions(int nrOfSolutions, boolean showOrderedSolutions) {
1257 // QUALITY: could be optimized
1258 if (!logger.isTraceEnabled())
1259 return;
1260 // if(!logger.getLevel().toString().equalsIgnoreCase("TRACE"))return;
1261 if (nrOfSolutions == 0)
1262 nrOfSolutions = candidatesStable.size();
1263 int i = 0;
1264 for (ExampleBasedNode n : candidatesStable.descendingSet()) {
1265 if (n.getAccuracy(nrOfPositiveExamples, nrOfNegativeExamples) < 1)
1266 break;
1267 logger.trace("best: "
1268 + n.getShortDescription(nrOfPositiveExamples, nrOfNegativeExamples, baseURI));
1269 if (i == nrOfSolutions)
1270 break;
1271 i++;
1272 }
1273
1274 if (showOrderedSolutions) {
1275 logger.trace("ordered by generality (most special solutions first):");
1276 SubsumptionComparator sc = new SubsumptionComparator(rs);
1277 TreeSet<Description> solutionsOrderedBySubsumption = new TreeSet<Description>(sc);
1278 // solutionsOrderedBySubsumption.addAll(solutions);
1279 for (Description d : solutionsOrderedBySubsumption)
1280 logger.trace("special: " + d);
1281 throw new Error("implementation needs to be updated to show ordered solutions");
1282 }
1283 /*
1284 * for (int j = 0; j < solutions.size(); j++) { Description d =
1285 * solutions.get(j); logger.trace(d.toString()); }
1286 */
1287
1288 }
1289
1290 public ScorePosNeg getSolutionScore() {
1291 return (ScorePosNeg) learningProblem.computeScore(getBestSolution());
1292 }
1293
1294 private ScorePosNeg getScore(Description d) {
1295 return (ScorePosNeg) learningProblem.computeScore(d);
1296 }
1297
1298 public ExampleBasedNode getStartNode() {
1299 return startNode;
1300 }
1301
1302 // returns true if there is any meaningful node in the subtree
1303 private boolean checkSubtreePosOnly(ExampleBasedNode node) {
1304 for (ExampleBasedNode refinement : node.getChildren()) {
1305
1306 if (!node.isTooWeak()) {
1307 // refinement meaningful
1308 if (isPosOnlyRefinementMeaningful(node, refinement))
1309 return true;
1310
1311 // subtree with refinement as root contains a meaningful node
1312 if (checkSubtreePosOnly(refinement))
1313 return true;
1314 }
1315
1316 }
1317 return false;
1318 }
1319
1320 // returns whether the refinement is "meaningful", i.e. the refinement
1321 // actually represents a different concept
1322 // than its parent; this is needed to determine when a positive only
1323 // learning algorithm should stop (when a node
1324 // has been expaned x times without yielding any meaningful refinements, it
1325 // is considered a possible solution)
1326 private boolean isPosOnlyRefinementMeaningful(ExampleBasedNode node, ExampleBasedNode refinement) {
1327 Description d1 = node.getConcept();
1328 Description d2 = refinement.getConcept();
1329 // check whether d2 can be shortened, e.g. male AND male => male
1330 Description shortConcept = ConceptTransformation.getShortConcept(d2, conceptComparator);
1331 if (conceptComparator.compare(d1, shortConcept) != 0)
1332 return false;
1333 return true;
1334 }
1335
1336
1337
1338 /**
1339 * In this function it is calculated if the algorithm should stop.
1340 * This is not always depends whether an actual solution was found
1341 * The algorithm stops if:
1342 * 1. the object attribute stop is set to true (possibly by an outside source)
1343 * 2. the maximimum execution time is reached
1344 * 3. the maximum number of class description tests is reached
1345 *
1346 * Continuation criteria and result improvement
1347 * The algorithm continues (although it would normally stop) if
1348 * 1. Minimum execution time is not reached (default 0)
1349 * 2. not enough good solutions are found (default 1)
1350 * otherwise it stops
1351 *
1352 * @return true if the algorithm should stop, this is mostly indepent of the question if a solution was found
1353 */
1354 private boolean isTerminationCriteriaReached(){
1355 if(this.stop){
1356 return true;
1357 }
1358 long totalTimeNeeded = System.currentTimeMillis() - this.runtime;
1359 long maxMilliSeconds = maxExecutionTimeInSeconds * 1000;
1360 long minMilliSeconds = minExecutionTimeInSeconds * 1000;
1361 int conceptTests = conceptTestsReasoner + conceptTestsTooWeakList + conceptTestsOverlyGeneralList;
1362 boolean result = false;
1363
1364 //ignore default
1365 if (maxExecutionTimeInSeconds == 0)
1366 result = false;
1367 //alreadyReached
1368 else if (maxExecutionTimeAlreadyReached)
1369 return true;
1370 //test
1371 else if (maxMilliSeconds < totalTimeNeeded) {
1372 this.stop();
1373 logger.info("Maximum time (" + maxExecutionTimeInSeconds
1374 + " seconds) reached, stopping now...");
1375 maxExecutionTimeAlreadyReached = true;
1376 return true;
1377 }
1378
1379 //ignore default
1380 if(maxClassDescriptionTests == 0)
1381 result = false;
1382 //test
1383 else if(conceptTests >= maxClassDescriptionTests){
1384 logger.info("Maximum Class Description tests (" + maxClassDescriptionTests
1385 + " tests [actual: "+conceptTests+"]) reached, stopping now...");
1386 return true;
1387 }
1388
1389
1390 if (guaranteeXgoodAlreadyReached){
1391 result = true;
1392 } else if(solutions.size() >= guaranteeXgoodDescriptions) {
1393 if(guaranteeXgoodDescriptions != 1) {
1394 logger.info("Minimum number (" + guaranteeXgoodDescriptions
1395 + ") of good descriptions reached.");
1396 }
1397 guaranteeXgoodAlreadyReached = true;
1398 result = true;
1399 }
1400
1401
1402 if (minExecutionTimeAlreadyReached){
1403 result = result && true;
1404 }else if(minMilliSeconds < totalTimeNeeded) {
1405 if(minExecutionTimeInSeconds != 0) {
1406 logger.info("Minimum time (" + minExecutionTimeInSeconds
1407 + " seconds) reached.");
1408 }
1409 minExecutionTimeAlreadyReached = true;
1410 result = result && true;
1411 }
1412
1413 return result;
1414
1415 }
1416
1417 public boolean isRunning() {
1418 return isRunning;
1419 }
1420
1421 }