001    /**
002     * Copyright (C) 2007-2008, Jens Lehmann
003     *
004     * This file is part of DL-Learner.
005     * 
006     * DL-Learner is free software; you can redistribute it and/or modify
007     * it under the terms of the GNU General Public License as published by
008     * the Free Software Foundation; either version 3 of the License, or
009     * (at your option) any later version.
010     *
011     * DL-Learner is distributed in the hope that it will be useful,
012     * but WITHOUT ANY WARRANTY; without even the implied warranty of
013     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
014     * GNU General Public License for more details.
015     *
016     * You should have received a copy of the GNU General Public License
017     * along with this program.  If not, see <http://www.gnu.org/licenses/>.
018     *
019     */
020    package org.dllearner.scripts.improveWikipedia;
021    
022    import java.util.List;
023    import java.util.SortedSet;
024    import java.util.TreeSet;
025    
026    import org.apache.log4j.Logger;
027    import org.dllearner.kb.sparql.SPARQLTasks;
028    import org.dllearner.learningproblems.EvaluatedDescriptionPosNeg;
029    import org.dllearner.utilities.Helper;
030    import org.dllearner.utilities.datastructures.SetManipulation;
031    import org.dllearner.utilities.examples.AutomaticNegativeExampleFinderSPARQL;
032    import org.dllearner.utilities.examples.AutomaticPositiveExampleFinderSPARQL;
033    
034    public class WikipediaCategoryTasks {
035    
036            private static Logger logger = Logger
037                            .getLogger(WikipediaCategoryTasks.class);
038    
039            private SPARQLTasks sparqlTasks;
040    
041            // these cahnge all the time
042            private SortedSet<String> posExamples = new TreeSet<String>();
043    
044            private SortedSet<String> negExamples = new TreeSet<String>();
045    
046            // these dont change, they are for collecting
047            private SortedSet<String> cleanedPositiveSet = new TreeSet<String>();
048    
049            private SortedSet<String> fullPositiveSet = new TreeSet<String>();
050    
051            private SortedSet<String> definitelyWrongIndividuals = new TreeSet<String>();
052    
053            public WikipediaCategoryTasks(SPARQLTasks sparqlTasks) {
054                    this.sparqlTasks = sparqlTasks;
055            }
056    
057            /**
058             * The strategy is yet really simple. //TODO take the best concept and the
059             * notCoveredPositives are the ones definitely wrong these are removed from
060             * the positives examples.
061             * 
062             * @param conceptresults
063             * @param posExamples
064             */
065            public SortedSet<String> calculateWrongIndividualsAndNewPosEx(
066                            List<EvaluatedDescriptionPosNeg> conceptresults,
067                            SortedSet<String> posExamples) {
068    
069                    definitelyWrongIndividuals.clear();
070                    definitelyWrongIndividuals.addAll(Helper.getStringSet(conceptresults.get(0)
071                                    .getNotCoveredPositives()));
072    
073                    // clean the examples
074                    posExamples.removeAll(definitelyWrongIndividuals);
075                    this.posExamples.clear();
076                    this.posExamples.addAll(posExamples);
077                    this.cleanedPositiveSet.addAll(posExamples);
078                    // fullPosSetWithoutPosExamples.removeAll(definitelyWrongIndividuals);
079    
080                    logger.trace("posExamples" + posExamples.size());
081                    logger.trace("fullPositives" + fullPositiveSet.size());
082    
083                    negExamples.clear();
084    
085                    return definitelyWrongIndividuals;
086    
087            }
088    
089            /**
090             * TODO could be more sophisticated
091             * 
092             * @param reEvaluatedDesc
093             */
094            public SortedSet<String> makeNewNegativeExamples(
095                            List<EvaluatedDescriptionPosNeg> reEvaluatedDesc,
096                            SortedSet<String> posExamples, double negFactor) {
097                    negExamples.clear();
098    
099                    EvaluatedDescriptionPosNeg newDesc = reEvaluatedDesc.get(0);
100                    logger.info("Best concept: " + newDesc.getDescription());
101    
102                    negExamples.addAll(Helper.getStringSet(newDesc.getCoveredPositives()));
103                    negExamples.addAll(Helper
104                                    .getStringSet(newDesc.getNotCoveredPositives()));
105                    negExamples.addAll(Helper.getStringSet(newDesc.getCoveredNegatives()));
106                    negExamples.addAll(Helper
107                                    .getStringSet(newDesc.getNotCoveredNegatives()));
108    
109                    negExamples.removeAll(posExamples);
110    
111                    int neglimit = (int) Math.round(posExamples.size() * negFactor);
112                    negExamples = SetManipulation.fuzzyShrink(negExamples, neglimit);
113    
114                    return negExamples;
115            }
116    
117            /**
118             * makes positive and negative Examples. positives are a simple retrieval of
119             * the category. negatives are made from parallelclasses.
120             * 
121             * @param targetCategory
122             * @param percentOfSKOSSet
123             *            percentage used from the SKOSSet for training
124             * @param negFactor
125             *            size of the negative Examples compared to the posExample size
126             *            (1.0 means equal size)
127             * @param sparqlResultLimit
128             */
129            public void makeInitialExamples(String targetCategory,
130                            double percentOfSKOSSet, double negFactor, int sparqlResultLimitNegativeExamples,
131                            boolean stable) {
132                    fullPositiveSet.clear();
133                    // fullPosSetWithoutPosExamples.clear();
134                    posExamples.clear();
135                    negExamples.clear();
136    
137                    // POSITIVES
138                    AutomaticPositiveExampleFinderSPARQL apos = new AutomaticPositiveExampleFinderSPARQL(
139                                    sparqlTasks);
140                    apos.makePositiveExamplesFromSKOSConcept(targetCategory);
141                    fullPositiveSet.addAll(apos.getPosExamples());
142    
143                    int poslimit = (int) Math.round(percentOfSKOSSet
144                                    * fullPositiveSet.size());
145                    int neglimit = (int) Math.round(poslimit * negFactor);
146    
147                    posExamples.addAll(SetManipulation.fuzzyShrink(fullPositiveSet, poslimit));
148    
149                    // NEGATIVES
150    
151                    AutomaticNegativeExampleFinderSPARQL aneg = new AutomaticNegativeExampleFinderSPARQL(
152                                    fullPositiveSet, sparqlTasks, new TreeSet<String>());
153    
154                    aneg.makeNegativeExamplesFromParallelClasses(posExamples,
155                                    sparqlResultLimitNegativeExamples);
156                    negExamples = aneg.getNegativeExamples(neglimit, stable);
157    
158                    logger.debug("POSITIVE EXAMPLES");
159                    for (String pos : posExamples) {
160                            logger.debug("+" + pos);
161                    }
162    
163                    logger.debug("NEGATIVE EXAMPLES");
164                    for (String negs : this.negExamples) {
165                            logger.debug("-" + negs);
166                    }
167    
168                    // fullPosSetWithoutPosExamples.addAll(fullPositiveSet);
169                    // fullPosSetWithoutPosExamples.removeAll(posExamples);
170    
171                    // logger.debug(fullPositiveSet);
172    
173                    // logger.debug(fullPosSetWithoutPosExamples);
174    
175            }
176    
177            public SortedSet<String> getPosExamples() {
178                    return posExamples;
179            }
180    
181            public SortedSet<String> getNegExamples() {
182                    return negExamples;
183            }
184    
185            public SortedSet<String> getFullPositiveSet() {
186                    return fullPositiveSet;
187            }
188    
189            public SortedSet<String> getDefinitelyWrongIndividuals() {
190                    return definitelyWrongIndividuals;
191            }
192    
193            public SortedSet<String> getCleanedPositiveSet() {
194                    return cleanedPositiveSet;
195            }
196    
197    }