001    /**
002     * Copyright (C) 2007-2008, Jens Lehmann
003     *
004     * This file is part of DL-Learner.
005     * 
006     * DL-Learner is free software; you can redistribute it and/or modify
007     * it under the terms of the GNU General Public License as published by
008     * the Free Software Foundation; either version 3 of the License, or
009     * (at your option) any later version.
010     *
011     * DL-Learner is distributed in the hope that it will be useful,
012     * but WITHOUT ANY WARRANTY; without even the implied warranty of
013     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
014     * GNU General Public License for more details.
015     *
016     * You should have received a copy of the GNU General Public License
017     * along with this program.  If not, see <http://www.gnu.org/licenses/>.
018     *
019     */
020    package org.dllearner.scripts.improveWikipedia;
021    
022    import java.util.ArrayList;
023    import java.util.List;
024    import java.util.SortedSet;
025    import java.util.TreeSet;
026    
027    import org.apache.log4j.Logger;
028    import org.dllearner.core.owl.Individual;
029    import org.dllearner.kb.sparql.SPARQLTasks;
030    import org.dllearner.learningproblems.EvaluatedDescriptionPosNeg;
031    import org.dllearner.utilities.Helper;
032    import org.dllearner.utilities.owl.EvaluatedDescriptionPosNegComparator;
033    
034    /**
035     * @author Sebastian Hellmann
036     * 
037     * The EvaluatedDescriptions from a fragment are validated against the
038     * SPARQLendpoint. There are different strategies, see the methods;
039     */
040    public class ConceptSPARQLReEvaluator {
041    
042            private static Logger logger = Logger
043                            .getLogger(ConceptSPARQLReEvaluator.class);
044    
045            List<EvaluatedDescriptionPosNeg> descToBeReevaluated;
046    
047            SPARQLTasks sparqlTasks;
048    
049            int sparqlResultLimit = 1000;
050    
051            int depthOfRDFS = 1;
052    
053            /**
054             * Constructor using default settings
055             * 
056             * @param sparqlTasks
057             */
058            public ConceptSPARQLReEvaluator(SPARQLTasks sparqlTasks) {
059                    this.sparqlTasks = sparqlTasks;
060            }
061    
062            /**
063             * constructor to manually set parameters
064             * 
065             * @param sparqlTasks
066             * @param depthOfRDFS
067             * @param sparqlResultLimit
068             */
069            public ConceptSPARQLReEvaluator(SPARQLTasks sparqlTasks, int depthOfRDFS,
070                            int sparqlResultLimit) {
071                    this(sparqlTasks);
072                    this.depthOfRDFS = depthOfRDFS;
073                    this.sparqlResultLimit = sparqlResultLimit;
074            }
075    
076            /**
077             * Accuracy is calculated as correct positive classified over (correct
078             * positive classified + incorrect negative classified) "How many are
079             * correctly positive classified?" e.g. 50 individuals of a 60-individual
080             * Category (50/60)
081             * 
082             * @param positiveSet
083             */
084            public List<EvaluatedDescriptionPosNeg> reevaluateConceptsByDataCoverage(
085                            List<EvaluatedDescriptionPosNeg> descToBeReevaluated,
086                            SortedSet<String> positiveSet) {
087    
088                    SortedSet<EvaluatedDescriptionPosNeg> returnSet = new TreeSet<EvaluatedDescriptionPosNeg>(
089                                    new EvaluatedDescriptionPosNegComparator());
090    
091                    SortedSet<String> instances = new TreeSet<String>();
092                    SortedSet<String> PosAsPos = new TreeSet<String>();
093                    SortedSet<String> PosAsNeg = new TreeSet<String>();
094    
095                    // NegAsPos doesnt exist, because they are supposed to be possible
096                    // candidates
097                    SortedSet<Individual> NegAsPos = new TreeSet<Individual>();
098                    // NegAsNeg doesnt exist, because all
099                    SortedSet<Individual> NegAsNeg = new TreeSet<Individual>();
100    
101                    // elements are immediately removed from the list to save memory
102                    while (!descToBeReevaluated.isEmpty()) {
103                            EvaluatedDescriptionPosNeg ed = descToBeReevaluated.remove(0);
104                            try {
105                            instances = retrieveInstances(ed);
106    
107                            // PosAsPos
108                            PosAsPos.addAll(positiveSet);
109                            PosAsPos.retainAll(instances);
110    
111                            // PosAsNeg
112                            PosAsNeg.addAll(positiveSet);
113                            PosAsNeg.removeAll(PosAsPos);
114                            
115                            EvaluatedDescriptionPosNeg d = new EvaluatedDescriptionPosNeg(ed.getDescription(), Helper
116                                            .getIndividualSet(PosAsPos), Helper
117                                            .getIndividualSet(PosAsNeg), NegAsPos, NegAsNeg);
118                            
119                            if(d.getAccuracy()<0.1 || d.getNotCoveredPositives().isEmpty()){
120                                    
121                            }else{
122                                    returnSet.add(d);
123                            }
124    
125                            
126                    
127    
128                            }catch(Exception e){
129                                    logger.warn("ERROR occured, while evaluating, I'm ignoring it : "+e.toString());
130                                    logger.warn("Concept was: "+ed.getDescription().toKBSyntaxString());
131                            }finally{
132                                    PosAsPos.clear();
133                                    PosAsNeg.clear();
134                            }
135    
136                    }
137    
138                    return new ArrayList<EvaluatedDescriptionPosNeg>(returnSet);
139    
140            }
141    
142            /**
143             * Accuracy is calculated as correct positive classified over all retrieved
144             * e.g. 50 correct out of 400 retrieved (50/400)
145             * 
146             * @param positiveSet
147             */
148            public List<EvaluatedDescriptionPosNeg> reevaluateConceptsByLowestRecall(
149                            List<EvaluatedDescriptionPosNeg> descToBeReevaluated,
150                            SortedSet<String> positiveSet) {
151                    logger.info("reevaluating by lowest recall "
152                                    + descToBeReevaluated.size() + " concepts");
153                    SortedSet<EvaluatedDescriptionPosNeg> returnSet = new TreeSet<EvaluatedDescriptionPosNeg>(
154                                    new EvaluatedDescriptionPosNegComparator());
155    
156                    SortedSet<String> instances = new TreeSet<String>();
157    
158                    SortedSet<String> PosAsPos = new TreeSet<String>();
159                    SortedSet<String> PosAsNeg = new TreeSet<String>();
160    
161                    SortedSet<Individual> NegAsPos = new TreeSet<Individual>();
162                    SortedSet<Individual> NegAsNeg = new TreeSet<Individual>();
163                    
164                    // elements are immediately removed from the list to save memory
165                    while (!descToBeReevaluated.isEmpty()) {
166                            EvaluatedDescriptionPosNeg ed = descToBeReevaluated.remove(0);
167                            try {
168                            instances = retrieveInstances(ed);
169    
170                            // PosAsPos
171                            PosAsPos.addAll(positiveSet);
172                            PosAsPos.retainAll(instances);
173    
174                            // PosAsNeg
175                            PosAsNeg.addAll(instances);
176                            PosAsNeg.removeAll(PosAsPos);
177    
178                            EvaluatedDescriptionPosNeg d = new EvaluatedDescriptionPosNeg(ed.getDescription(), Helper
179                                            .getIndividualSet(PosAsPos), Helper
180                                            .getIndividualSet(PosAsNeg), NegAsPos, NegAsNeg);
181                            
182                            
183                            if(d.getNotCoveredPositives().isEmpty()){
184                                    
185                            }else{
186                                    returnSet.add(d);
187                            }
188    
189                            
190                            }catch(Exception e){
191                                    logger.warn("ERROR occured, while evaluating, I'm ignoring it :"+e.toString());
192                                    logger.warn("Concept was: "+ed.getDescription().toKBSyntaxString());
193                            }finally{
194                                    PosAsPos.clear();
195                                    PosAsNeg.clear();
196                            }
197                    }
198                    logger.info("finished reevaluating by lowest recall :"
199                                    + returnSet.size() + " concepts");
200                    return new ArrayList<EvaluatedDescriptionPosNeg>(returnSet);
201    
202            }
203    
204            private SortedSet<String> retrieveInstances(EvaluatedDescriptionPosNeg ed) {
205                    String kbsyntax = ed.getDescription().toKBSyntaxString();
206                    return sparqlTasks
207                                    .retrieveInstancesForClassDescriptionIncludingSubclasses(
208                                                    kbsyntax, sparqlResultLimit, depthOfRDFS);
209            }
210    
211            /*
212             * public List<EvaluatedDescription> reevaluateConceptsByLowestRecall( List<EvaluatedDescription>
213             * descToBeReevaluated, SortedSet<String> positiveSet, int maxNrOfConcepts) {
214             * List<EvaluatedDescription> tmp =
215             * reevaluateConceptsByLowestRecall(descToBeReevaluated, positiveSet); List<EvaluatedDescription>
216             * returnSet = new ArrayList<EvaluatedDescription>();
217             * 
218             * while ((!tmp.isEmpty()) && (returnSet.size() <= maxNrOfConcepts)) {
219             * returnSet.add(tmp.remove(0)); }
220             * 
221             * return returnSet; }
222             */
223    
224            /*
225             * public List<EvaluatedDescription> reevaluateConceptsByDataCoverage( List<EvaluatedDescription>
226             * descToBeReevaluated, SortedSet<String> positiveSet, int maxNrOfConcepts) {
227             * List<EvaluatedDescription> tmp =
228             * reevaluateConceptsByLowestRecall(descToBeReevaluated, positiveSet); List<EvaluatedDescription>
229             * returnSet = new ArrayList<EvaluatedDescription>();
230             * 
231             * while ((!tmp.isEmpty()) && (returnSet.size() <= maxNrOfConcepts)) {
232             * returnSet.add(tmp.remove(0)); }
233             * 
234             * return returnSet; }
235             */
236    
237    }