001 /**
002 * Copyright (C) 2007-2008, Jens Lehmann
003 *
004 * This file is part of DL-Learner.
005 *
006 * DL-Learner is free software; you can redistribute it and/or modify
007 * it under the terms of the GNU General Public License as published by
008 * the Free Software Foundation; either version 3 of the License, or
009 * (at your option) any later version.
010 *
011 * DL-Learner is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
014 * GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License
017 * along with this program. If not, see <http://www.gnu.org/licenses/>.
018 *
019 */
020 package org.dllearner.scripts.improveWikipedia;
021
022 import java.util.ArrayList;
023 import java.util.List;
024 import java.util.SortedSet;
025 import java.util.TreeSet;
026
027 import org.apache.log4j.Logger;
028 import org.dllearner.core.owl.Individual;
029 import org.dllearner.kb.sparql.SPARQLTasks;
030 import org.dllearner.learningproblems.EvaluatedDescriptionPosNeg;
031 import org.dllearner.utilities.Helper;
032 import org.dllearner.utilities.owl.EvaluatedDescriptionPosNegComparator;
033
034 /**
035 * @author Sebastian Hellmann
036 *
037 * The EvaluatedDescriptions from a fragment are validated against the
038 * SPARQLendpoint. There are different strategies, see the methods;
039 */
040 public class ConceptSPARQLReEvaluator {
041
042 private static Logger logger = Logger
043 .getLogger(ConceptSPARQLReEvaluator.class);
044
045 List<EvaluatedDescriptionPosNeg> descToBeReevaluated;
046
047 SPARQLTasks sparqlTasks;
048
049 int sparqlResultLimit = 1000;
050
051 int depthOfRDFS = 1;
052
053 /**
054 * Constructor using default settings
055 *
056 * @param sparqlTasks
057 */
058 public ConceptSPARQLReEvaluator(SPARQLTasks sparqlTasks) {
059 this.sparqlTasks = sparqlTasks;
060 }
061
062 /**
063 * constructor to manually set parameters
064 *
065 * @param sparqlTasks
066 * @param depthOfRDFS
067 * @param sparqlResultLimit
068 */
069 public ConceptSPARQLReEvaluator(SPARQLTasks sparqlTasks, int depthOfRDFS,
070 int sparqlResultLimit) {
071 this(sparqlTasks);
072 this.depthOfRDFS = depthOfRDFS;
073 this.sparqlResultLimit = sparqlResultLimit;
074 }
075
076 /**
077 * Accuracy is calculated as correct positive classified over (correct
078 * positive classified + incorrect negative classified) "How many are
079 * correctly positive classified?" e.g. 50 individuals of a 60-individual
080 * Category (50/60)
081 *
082 * @param positiveSet
083 */
084 public List<EvaluatedDescriptionPosNeg> reevaluateConceptsByDataCoverage(
085 List<EvaluatedDescriptionPosNeg> descToBeReevaluated,
086 SortedSet<String> positiveSet) {
087
088 SortedSet<EvaluatedDescriptionPosNeg> returnSet = new TreeSet<EvaluatedDescriptionPosNeg>(
089 new EvaluatedDescriptionPosNegComparator());
090
091 SortedSet<String> instances = new TreeSet<String>();
092 SortedSet<String> PosAsPos = new TreeSet<String>();
093 SortedSet<String> PosAsNeg = new TreeSet<String>();
094
095 // NegAsPos doesnt exist, because they are supposed to be possible
096 // candidates
097 SortedSet<Individual> NegAsPos = new TreeSet<Individual>();
098 // NegAsNeg doesnt exist, because all
099 SortedSet<Individual> NegAsNeg = new TreeSet<Individual>();
100
101 // elements are immediately removed from the list to save memory
102 while (!descToBeReevaluated.isEmpty()) {
103 EvaluatedDescriptionPosNeg ed = descToBeReevaluated.remove(0);
104 try {
105 instances = retrieveInstances(ed);
106
107 // PosAsPos
108 PosAsPos.addAll(positiveSet);
109 PosAsPos.retainAll(instances);
110
111 // PosAsNeg
112 PosAsNeg.addAll(positiveSet);
113 PosAsNeg.removeAll(PosAsPos);
114
115 EvaluatedDescriptionPosNeg d = new EvaluatedDescriptionPosNeg(ed.getDescription(), Helper
116 .getIndividualSet(PosAsPos), Helper
117 .getIndividualSet(PosAsNeg), NegAsPos, NegAsNeg);
118
119 if(d.getAccuracy()<0.1 || d.getNotCoveredPositives().isEmpty()){
120
121 }else{
122 returnSet.add(d);
123 }
124
125
126
127
128 }catch(Exception e){
129 logger.warn("ERROR occured, while evaluating, I'm ignoring it : "+e.toString());
130 logger.warn("Concept was: "+ed.getDescription().toKBSyntaxString());
131 }finally{
132 PosAsPos.clear();
133 PosAsNeg.clear();
134 }
135
136 }
137
138 return new ArrayList<EvaluatedDescriptionPosNeg>(returnSet);
139
140 }
141
142 /**
143 * Accuracy is calculated as correct positive classified over all retrieved
144 * e.g. 50 correct out of 400 retrieved (50/400)
145 *
146 * @param positiveSet
147 */
148 public List<EvaluatedDescriptionPosNeg> reevaluateConceptsByLowestRecall(
149 List<EvaluatedDescriptionPosNeg> descToBeReevaluated,
150 SortedSet<String> positiveSet) {
151 logger.info("reevaluating by lowest recall "
152 + descToBeReevaluated.size() + " concepts");
153 SortedSet<EvaluatedDescriptionPosNeg> returnSet = new TreeSet<EvaluatedDescriptionPosNeg>(
154 new EvaluatedDescriptionPosNegComparator());
155
156 SortedSet<String> instances = new TreeSet<String>();
157
158 SortedSet<String> PosAsPos = new TreeSet<String>();
159 SortedSet<String> PosAsNeg = new TreeSet<String>();
160
161 SortedSet<Individual> NegAsPos = new TreeSet<Individual>();
162 SortedSet<Individual> NegAsNeg = new TreeSet<Individual>();
163
164 // elements are immediately removed from the list to save memory
165 while (!descToBeReevaluated.isEmpty()) {
166 EvaluatedDescriptionPosNeg ed = descToBeReevaluated.remove(0);
167 try {
168 instances = retrieveInstances(ed);
169
170 // PosAsPos
171 PosAsPos.addAll(positiveSet);
172 PosAsPos.retainAll(instances);
173
174 // PosAsNeg
175 PosAsNeg.addAll(instances);
176 PosAsNeg.removeAll(PosAsPos);
177
178 EvaluatedDescriptionPosNeg d = new EvaluatedDescriptionPosNeg(ed.getDescription(), Helper
179 .getIndividualSet(PosAsPos), Helper
180 .getIndividualSet(PosAsNeg), NegAsPos, NegAsNeg);
181
182
183 if(d.getNotCoveredPositives().isEmpty()){
184
185 }else{
186 returnSet.add(d);
187 }
188
189
190 }catch(Exception e){
191 logger.warn("ERROR occured, while evaluating, I'm ignoring it :"+e.toString());
192 logger.warn("Concept was: "+ed.getDescription().toKBSyntaxString());
193 }finally{
194 PosAsPos.clear();
195 PosAsNeg.clear();
196 }
197 }
198 logger.info("finished reevaluating by lowest recall :"
199 + returnSet.size() + " concepts");
200 return new ArrayList<EvaluatedDescriptionPosNeg>(returnSet);
201
202 }
203
204 private SortedSet<String> retrieveInstances(EvaluatedDescriptionPosNeg ed) {
205 String kbsyntax = ed.getDescription().toKBSyntaxString();
206 return sparqlTasks
207 .retrieveInstancesForClassDescriptionIncludingSubclasses(
208 kbsyntax, sparqlResultLimit, depthOfRDFS);
209 }
210
211 /*
212 * public List<EvaluatedDescription> reevaluateConceptsByLowestRecall( List<EvaluatedDescription>
213 * descToBeReevaluated, SortedSet<String> positiveSet, int maxNrOfConcepts) {
214 * List<EvaluatedDescription> tmp =
215 * reevaluateConceptsByLowestRecall(descToBeReevaluated, positiveSet); List<EvaluatedDescription>
216 * returnSet = new ArrayList<EvaluatedDescription>();
217 *
218 * while ((!tmp.isEmpty()) && (returnSet.size() <= maxNrOfConcepts)) {
219 * returnSet.add(tmp.remove(0)); }
220 *
221 * return returnSet; }
222 */
223
224 /*
225 * public List<EvaluatedDescription> reevaluateConceptsByDataCoverage( List<EvaluatedDescription>
226 * descToBeReevaluated, SortedSet<String> positiveSet, int maxNrOfConcepts) {
227 * List<EvaluatedDescription> tmp =
228 * reevaluateConceptsByLowestRecall(descToBeReevaluated, positiveSet); List<EvaluatedDescription>
229 * returnSet = new ArrayList<EvaluatedDescription>();
230 *
231 * while ((!tmp.isEmpty()) && (returnSet.size() <= maxNrOfConcepts)) {
232 * returnSet.add(tmp.remove(0)); }
233 *
234 * return returnSet; }
235 */
236
237 }