001    /**
002     * Copyright (C) 2007-2011, Jens Lehmann
003     *
004     * This file is part of DL-Learner.
005     *
006     * DL-Learner is free software; you can redistribute it and/or modify
007     * it under the terms of the GNU General Public License as published by
008     * the Free Software Foundation; either version 3 of the License, or
009     * (at your option) any later version.
010     *
011     * DL-Learner is distributed in the hope that it will be useful,
012     * but WITHOUT ANY WARRANTY; without even the implied warranty of
013     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
014     * GNU General Public License for more details.
015     *
016     * You should have received a copy of the GNU General Public License
017     * along with this program.  If not, see <http://www.gnu.org/licenses/>.
018     */
019    
020    package org.dllearner.learningproblems;
021    
022    import java.util.Collection;
023    import java.util.Collections;
024    import java.util.LinkedList;
025    import java.util.List;
026    import java.util.Random;
027    import java.util.Set;
028    import java.util.SortedSet;
029    import java.util.TreeSet;
030    
031    import org.dllearner.core.AbstractLearningProblem;
032    import org.dllearner.core.AbstractReasonerComponent;
033    import org.dllearner.core.configurators.PosOnlyLPConfigurator;
034    import org.dllearner.core.options.CommonConfigMappings;
035    import org.dllearner.core.options.ConfigEntry;
036    import org.dllearner.core.options.ConfigOption;
037    import org.dllearner.core.options.InvalidConfigOptionValueException;
038    import org.dllearner.core.options.StringSetConfigOption;
039    import org.dllearner.core.owl.Description;
040    import org.dllearner.core.owl.Individual;
041    
042    /**
043     * A learning problem, where we learn from positive examples only.
044     * 
045     * @author Jens Lehmann
046     *
047     */
048    public class PosOnlyLP extends AbstractLearningProblem {
049    
050            protected SortedSet<Individual> positiveExamples;
051            private List<Individual> positiveExamplesShuffled;
052    //      protected SortedSet<Individual> pseudoNegatives;
053            private List<Individual> individuals;
054    
055            // approximation of accuracy +- 0.03 %
056            private static final double approx = 0.03;      
057            
058    //      private PosNegLPStandard definitionLP;
059            private PosOnlyLPConfigurator configurator;
060            
061            @Override
062            public PosOnlyLPConfigurator getConfigurator(){
063                    return configurator;
064            }       
065            
066            public PosOnlyLP() {
067                    super(null);
068                    configurator = new PosOnlyLPConfigurator(this);
069            }
070            
071            public PosOnlyLP(AbstractReasonerComponent reasoningService) {
072                    super(reasoningService);
073                    configurator = new PosOnlyLPConfigurator(this);
074            }
075    
076            /*
077             * (non-Javadoc)
078             * 
079             * @see org.dllearner.core.Component#applyConfigEntry(org.dllearner.core.ConfigEntry)
080             */
081            @Override
082            @SuppressWarnings( { "unchecked" })
083            public <T> void applyConfigEntry(ConfigEntry<T> entry) throws InvalidConfigOptionValueException {
084                    String name = entry.getOptionName();
085                    if (name.equals("positiveExamples"))
086                            positiveExamples = CommonConfigMappings
087                                            .getIndividualSet((Set<String>) entry.getValue());
088            }
089            
090            public static Collection<ConfigOption<?>> createConfigOptions() {
091                    Collection<ConfigOption<?>> options = new LinkedList<ConfigOption<?>>();
092                    options.add(new StringSetConfigOption("positiveExamples",
093                                    "positive examples", null, true, false));
094                    return options;
095            }               
096            
097            public static String getName() {
098                    return "pos only learning problem";
099            }
100            
101            @Override
102            public void init() {
103                    
104                    // old init code, where pos only was realised through pseudo negatives
105                    
106                    // by default we test all other instances of the knowledge base
107    //              pseudoNegatives = Helper.difference(reasoner.getIndividuals(), positiveExamples);
108                    
109                    // create an instance of a standard definition learning problem
110                    // instanciated with pseudo-negatives
111    //              definitionLP = ComponentFactory.getPosNegLPStandard(
112    //                              reasoner, 
113    //                              SetManipulation.indToString(positiveExamples), 
114    //                              SetManipulation.indToString(pseudoNegatives));
115                    //definitionLP = new PosNegDefinitionLP(reasoningService, positiveExamples, pseudoNegatives);
116                    // TODO: we must make sure that the problem also gets the same 
117                    // reasoning options (i.e. options are the same up to reversed example sets)
118    //              definitionLP.init();
119                    
120                    individuals = new LinkedList<Individual>(reasoner.getIndividuals());
121                    positiveExamplesShuffled = new LinkedList<Individual>(positiveExamples);
122                    Random rand = new Random(1);
123                    Collections.shuffle(individuals, rand);
124                    Collections.shuffle(positiveExamplesShuffled, rand);
125            }
126            
127            public SortedSet<Individual> getPositiveExamples() {
128                    return positiveExamples;
129            }       
130            
131            /**
132             * @return the pseudoNegatives
133             */
134    //      public SortedSet<Individual> getPseudoNegatives() {
135    //              return pseudoNegatives;
136    //      }       
137            
138    
139    //      public int coveredPseudoNegativeExamplesOrTooWeak(Description concept) {
140    //              return definitionLP.coveredNegativeExamplesOrTooWeak(concept);
141    //      }
142    
143            /* (non-Javadoc)
144             * @see org.dllearner.core.LearningProblem#computeScore(org.dllearner.core.owl.Description)
145             */
146            @Override
147            public ScorePosOnly computeScore(Description description) {
148                    Set<Individual> retrieval = reasoner.getIndividuals(description);
149                    
150                    Set<Individual> instancesCovered = new TreeSet<Individual>();
151                    Set<Individual> instancesNotCovered = new TreeSet<Individual>();
152                    for(Individual ind : positiveExamples) {
153                            if(retrieval.contains(ind)) {
154                                    instancesCovered.add(ind);
155                            } else {
156                                    instancesNotCovered.add(ind);
157                            }
158                    }
159                    
160                    double coverage = instancesCovered.size()/(double)positiveExamples.size();
161                    double protusion = retrieval.size() == 0 ? 0 : instancesCovered.size()/(double)retrieval.size();        
162                    
163                    // pass only additional instances to score object
164                    retrieval.removeAll(instancesCovered);
165                    return new ScorePosOnly(instancesCovered, instancesNotCovered, coverage, retrieval, protusion, getAccuracy(coverage, protusion));               
166            }
167    
168            /* (non-Javadoc)
169             * @see org.dllearner.core.LearningProblem#evaluate(org.dllearner.core.owl.Description)
170             */
171            @Override
172            public EvaluatedDescriptionPosOnly evaluate(Description description) {
173                    ScorePosOnly score = computeScore(description);
174                    return new EvaluatedDescriptionPosOnly(description, score);             
175            }
176    
177            /* (non-Javadoc)
178             * @see org.dllearner.core.LearningProblem#getAccuracy(org.dllearner.core.owl.Description)
179             */
180            @Override
181            public double getAccuracy(Description description) {
182                    Set<Individual> retrieval = reasoner.getIndividuals(description);
183                    
184                    int instancesCovered = 0;
185                    for(Individual ind : positiveExamples) {
186                            if(retrieval.contains(ind)) {
187                                    instancesCovered++;
188                            }
189                    }
190                    
191                    double coverage = instancesCovered/(double)positiveExamples.size();
192                    double protusion = retrieval.size() == 0 ? 0 : instancesCovered/(double)retrieval.size();       
193                    
194                    return getAccuracy(coverage, protusion);                
195            }
196    
197            /* (non-Javadoc)
198             * @see org.dllearner.core.LearningProblem#getAccuracyOrTooWeak(org.dllearner.core.owl.Description, double)
199             */
200            @Override
201            public double getAccuracyOrTooWeak(Description description, double noise) {
202                    
203                    // instead of using the standard operation, we use optimisation
204                    // and approximation here
205                    
206                    // we abort when there are too many uncovered positives
207                    int maxNotCovered = (int) Math.ceil(noise*positiveExamples.size());
208                    int instancesCovered = 0;
209                    int instancesNotCovered = 0;
210                    int total = 0;
211                    boolean estimatedA = false;
212                    
213                    double lowerBorderA = 0;
214                    int lowerEstimateA = 0;
215                    double upperBorderA = 1;
216                    int upperEstimateA = positiveExamples.size();
217                    
218                    for(Individual ind : positiveExamplesShuffled) {
219                            if(reasoner.hasType(description, ind)) {
220                                    instancesCovered++;
221                            } else {
222                                    instancesNotCovered ++;
223                                    if(instancesNotCovered > maxNotCovered) {
224                                            return -1;
225                                    }
226                            }
227                            
228                            // approximation step (starting after 10 tests)
229                            total = instancesCovered + instancesNotCovered;
230                            if(total > 10) {
231                                    // compute confidence interval
232                                    double p1 = p1(instancesCovered, total);
233                                    double p2 = p3(p1, total);
234                                    lowerBorderA = Math.max(0, p1 - p2);
235                                    upperBorderA = Math.min(1, p1 + p2);
236                                    double size = upperBorderA - lowerBorderA;
237                                    // if the interval has a size smaller than 10%, we can be confident
238                                    if(size < 2 * approx) {
239                                            // we have to distinguish the cases that the accuracy limit is
240                                            // below, within, or above the limit and that the mean is below
241                                            // or above the limit
242                                            double mean = instancesCovered/(double)total;
243                                            
244                                            // if the mean is greater than the required minimum, we can accept;
245                                            // we also accept if the interval is small and close to the minimum
246                                            // (worst case is to accept a few inaccurate descriptions)
247                                            if(mean > noise || (upperBorderA > mean && size < 0.03)) {
248                                                    instancesCovered = (int) (instancesCovered/(double)total * positiveExamples.size());
249                                                    upperEstimateA = (int) (upperBorderA * positiveExamples.size());
250                                                    lowerEstimateA = (int) (lowerBorderA * positiveExamples.size());
251                                                    estimatedA = true;
252                                                    break;
253                                            }
254                                            
255                                            // reject only if the upper border is far away (we are very
256                                            // certain not to lose a potential solution)
257                                            if(upperBorderA + 0.1 < noise) {
258                                                    return -1;
259                                            }
260                                    }                               
261                            }
262                    }       
263                    
264                    double coverage = instancesCovered/(double)positiveExamples.size();
265                    
266                    int testsPerformed = 0;
267                    int instancesDescription = 0;
268                    
269                    for(Individual ind : individuals) {
270    
271                            if(reasoner.hasType(description, ind)) {
272                                    instancesDescription++;
273                            }
274                            
275                            testsPerformed++;
276                            
277                            if(testsPerformed > 10) {
278                                    
279                                    // compute confidence interval
280                                    double p1 = p1(instancesDescription, testsPerformed);
281                                    double p2 = p3(p1, testsPerformed);
282                                    double lowerBorder = Math.max(0, p1 - p2);
283                                    double upperBorder = Math.min(1, p1 + p2);
284                                    int lowerEstimate = (int) (lowerBorder * individuals.size());
285                                    int upperEstimate = (int) (upperBorder * individuals.size());
286                                    
287                                    double size;
288                                    if(estimatedA) {
289    //                                      size = 1/(coverageFactor+1) * (coverageFactor * (upperBorderA-lowerBorderA) + Math.sqrt(upperEstimateA/(upperEstimateA+lowerEstimate)) + Math.sqrt(lowerEstimateA/(lowerEstimateA+upperEstimate)));
290                                            size = getAccuracy(upperBorderA, upperEstimateA/(double)(upperEstimateA+lowerEstimate)) - getAccuracy(lowerBorderA, lowerEstimateA/(double)(lowerEstimateA+upperEstimate));                                     
291                                    } else {
292    //                                      size = 1/(coverageFactor+1) * (coverageFactor * coverage + Math.sqrt(instancesCovered/(instancesCovered+lowerEstimate)) + Math.sqrt(instancesCovered/(instancesCovered+upperEstimate)));
293                                            size = getAccuracy(coverage, instancesCovered/(double)(instancesCovered+lowerEstimate)) - getAccuracy(coverage, instancesCovered/(double)(instancesCovered+upperEstimate));
294                                    }
295                                    
296                                    if(size < 0.1) {
297    //                                      System.out.println(instancesDescription + " of " + testsPerformed);
298    //                                      System.out.println("interval from " + lowerEstimate + " to " + upperEstimate);
299    //                                      System.out.println("size: " + size);
300                                            
301    //                                      estimatedB = true;
302                                            // calculate total number of instances
303                                            instancesDescription = (int) (instancesDescription/(double)testsPerformed * individuals.size());
304                                            break;
305                                    }
306                            }
307                    }
308                    
309                    // since we measured/estimated accuracy only on instances outside A (superClassInstances
310                    // does not include instances of A), we need to add it in the denominator
311                    double protusion = instancesCovered/(double)(instancesDescription+instancesCovered);
312                    if(instancesCovered + instancesDescription == 0) {
313                            protusion = 0;
314                    }               
315            
316                    return getAccuracy(coverage, protusion);                
317            }
318            
319            // see paper: expression used in confidence interval estimation
320            private static double p3(double p1, int total) {
321                    return 1.96 * Math.sqrt(p1*(1-p1)/(total+4));
322            }               
323                    
324            // see paper: p'
325            private static double p1(int success, int total) {
326                    return (success+2)/(double)(total+4);
327            }       
328            
329            private double getAccuracy(double coverage, double protusion) {
330                    return 0.5 * (coverage + Math.sqrt(protusion));
331            }       
332    }