001 /**
002 * Copyright (C) 2007-2008, Jens Lehmann
003 *
004 * This file is part of DL-Learner.
005 *
006 * DL-Learner is free software; you can redistribute it and/or modify
007 * it under the terms of the GNU General Public License as published by
008 * the Free Software Foundation; either version 3 of the License, or
009 * (at your option) any later version.
010 *
011 * DL-Learner is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
014 * GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License
017 * along with this program. If not, see <http://www.gnu.org/licenses/>.
018 *
019 */
020 package org.dllearner.scripts.improveWikipedia;
021
022 import java.util.List;
023 import java.util.SortedSet;
024 import java.util.TreeSet;
025
026 import org.apache.log4j.Logger;
027 import org.dllearner.kb.sparql.SPARQLTasks;
028 import org.dllearner.learningproblems.EvaluatedDescriptionPosNeg;
029 import org.dllearner.utilities.Helper;
030 import org.dllearner.utilities.datastructures.SetManipulation;
031 import org.dllearner.utilities.examples.AutomaticNegativeExampleFinderSPARQL;
032 import org.dllearner.utilities.examples.AutomaticPositiveExampleFinderSPARQL;
033
034 public class WikipediaCategoryTasks {
035
036 private static Logger logger = Logger
037 .getLogger(WikipediaCategoryTasks.class);
038
039 private SPARQLTasks sparqlTasks;
040
041 // these cahnge all the time
042 private SortedSet<String> posExamples = new TreeSet<String>();
043
044 private SortedSet<String> negExamples = new TreeSet<String>();
045
046 // these dont change, they are for collecting
047 private SortedSet<String> cleanedPositiveSet = new TreeSet<String>();
048
049 private SortedSet<String> fullPositiveSet = new TreeSet<String>();
050
051 private SortedSet<String> definitelyWrongIndividuals = new TreeSet<String>();
052
053 public WikipediaCategoryTasks(SPARQLTasks sparqlTasks) {
054 this.sparqlTasks = sparqlTasks;
055 }
056
057 /**
058 * The strategy is yet really simple. //TODO take the best concept and the
059 * notCoveredPositives are the ones definitely wrong these are removed from
060 * the positives examples.
061 *
062 * @param conceptresults
063 * @param posExamples
064 */
065 public SortedSet<String> calculateWrongIndividualsAndNewPosEx(
066 List<EvaluatedDescriptionPosNeg> conceptresults,
067 SortedSet<String> posExamples) {
068
069 definitelyWrongIndividuals.clear();
070 definitelyWrongIndividuals.addAll(Helper.getStringSet(conceptresults.get(0)
071 .getNotCoveredPositives()));
072
073 // clean the examples
074 posExamples.removeAll(definitelyWrongIndividuals);
075 this.posExamples.clear();
076 this.posExamples.addAll(posExamples);
077 this.cleanedPositiveSet.addAll(posExamples);
078 // fullPosSetWithoutPosExamples.removeAll(definitelyWrongIndividuals);
079
080 logger.trace("posExamples" + posExamples.size());
081 logger.trace("fullPositives" + fullPositiveSet.size());
082
083 negExamples.clear();
084
085 return definitelyWrongIndividuals;
086
087 }
088
089 /**
090 * TODO could be more sophisticated
091 *
092 * @param reEvaluatedDesc
093 */
094 public SortedSet<String> makeNewNegativeExamples(
095 List<EvaluatedDescriptionPosNeg> reEvaluatedDesc,
096 SortedSet<String> posExamples, double negFactor) {
097 negExamples.clear();
098
099 EvaluatedDescriptionPosNeg newDesc = reEvaluatedDesc.get(0);
100 logger.info("Best concept: " + newDesc.getDescription());
101
102 negExamples.addAll(Helper.getStringSet(newDesc.getCoveredPositives()));
103 negExamples.addAll(Helper
104 .getStringSet(newDesc.getNotCoveredPositives()));
105 negExamples.addAll(Helper.getStringSet(newDesc.getCoveredNegatives()));
106 negExamples.addAll(Helper
107 .getStringSet(newDesc.getNotCoveredNegatives()));
108
109 negExamples.removeAll(posExamples);
110
111 int neglimit = (int) Math.round(posExamples.size() * negFactor);
112 negExamples = SetManipulation.fuzzyShrink(negExamples, neglimit);
113
114 return negExamples;
115 }
116
117 /**
118 * makes positive and negative Examples. positives are a simple retrieval of
119 * the category. negatives are made from parallelclasses.
120 *
121 * @param targetCategory
122 * @param percentOfSKOSSet
123 * percentage used from the SKOSSet for training
124 * @param negFactor
125 * size of the negative Examples compared to the posExample size
126 * (1.0 means equal size)
127 * @param sparqlResultLimit
128 */
129 public void makeInitialExamples(String targetCategory,
130 double percentOfSKOSSet, double negFactor, int sparqlResultLimitNegativeExamples,
131 boolean stable) {
132 fullPositiveSet.clear();
133 // fullPosSetWithoutPosExamples.clear();
134 posExamples.clear();
135 negExamples.clear();
136
137 // POSITIVES
138 AutomaticPositiveExampleFinderSPARQL apos = new AutomaticPositiveExampleFinderSPARQL(
139 sparqlTasks);
140 apos.makePositiveExamplesFromSKOSConcept(targetCategory);
141 fullPositiveSet.addAll(apos.getPosExamples());
142
143 int poslimit = (int) Math.round(percentOfSKOSSet
144 * fullPositiveSet.size());
145 int neglimit = (int) Math.round(poslimit * negFactor);
146
147 posExamples.addAll(SetManipulation.fuzzyShrink(fullPositiveSet, poslimit));
148
149 // NEGATIVES
150
151 AutomaticNegativeExampleFinderSPARQL aneg = new AutomaticNegativeExampleFinderSPARQL(
152 fullPositiveSet, sparqlTasks, new TreeSet<String>());
153
154 aneg.makeNegativeExamplesFromParallelClasses(posExamples,
155 sparqlResultLimitNegativeExamples);
156 negExamples = aneg.getNegativeExamples(neglimit, stable);
157
158 logger.debug("POSITIVE EXAMPLES");
159 for (String pos : posExamples) {
160 logger.debug("+" + pos);
161 }
162
163 logger.debug("NEGATIVE EXAMPLES");
164 for (String negs : this.negExamples) {
165 logger.debug("-" + negs);
166 }
167
168 // fullPosSetWithoutPosExamples.addAll(fullPositiveSet);
169 // fullPosSetWithoutPosExamples.removeAll(posExamples);
170
171 // logger.debug(fullPositiveSet);
172
173 // logger.debug(fullPosSetWithoutPosExamples);
174
175 }
176
177 public SortedSet<String> getPosExamples() {
178 return posExamples;
179 }
180
181 public SortedSet<String> getNegExamples() {
182 return negExamples;
183 }
184
185 public SortedSet<String> getFullPositiveSet() {
186 return fullPositiveSet;
187 }
188
189 public SortedSet<String> getDefinitelyWrongIndividuals() {
190 return definitelyWrongIndividuals;
191 }
192
193 public SortedSet<String> getCleanedPositiveSet() {
194 return cleanedPositiveSet;
195 }
196
197 }