/**
* AbstractSolrConnector
* Copyright 2012 by Michael Peter Christen
* First released 27.06.2012 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr.connector;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.FacetField.Count;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.DisMaxParams;
import org.apache.solr.common.params.FacetParams;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.LookAheadIterator;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.schema.CollectionSchema;

public abstract class AbstractSolrConnector implements SolrConnector {
    protected static final Set<String> SOLR_ID_FIELDS = new HashSet<String>();
    protected static final Set<String> SOLR_ID_and_LOAD_DATE_FIELDS = new HashSet<String>();
static {
SOLR_ID_FIELDS.add(CollectionSchema.id.getSolrFieldName());
SOLR_ID_and_LOAD_DATE_FIELDS.add(CollectionSchema.id.getSolrFieldName());
SOLR_ID_and_LOAD_DATE_FIELDS.add(CollectionSchema.load_date_dt.getSolrFieldName());
}
public final static SolrDocument POISON_DOCUMENT = new SolrDocument();
public final static String POISON_ID = "POISON_ID";
public final static String CATCHALL_TERM = "[* TO *]";
public final static String CATCHALL_DTERM = ":" + CATCHALL_TERM;
public final static String CATCHALL_QUERY = "*:*";
public final static SolrQuery catchallQuery = new SolrQuery();
static {
catchallQuery.setQuery(CATCHALL_QUERY);
catchallQuery.setFields(CollectionSchema.id.getSolrFieldName());
catchallQuery.setRows(0);
catchallQuery.setStart(0);
}
public final static SolrQuery catchSuccessQuery = new SolrQuery();
static {
//catchSuccessQuery.setQuery("-" + CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);
catchSuccessQuery.setQuery(CATCHALL_QUERY); // failreason_s is only available for core collection1
catchSuccessQuery.setFields(CollectionSchema.id.getSolrFieldName());
catchSuccessQuery.clearSorts();
catchSuccessQuery.setIncludeScore(false);
catchSuccessQuery.setRows(0);
catchSuccessQuery.setStart(0);
}
protected final static int pagesize_docs = 100;
protected final static int pagesize_ids = 1000;
protected static LoadTimeURL getLoadTimeURL(final Object doc) {
if (doc == null) return null;
Object d = null;
String url = null;
if (doc instanceof SolrInputDocument) {
d = ((SolrInputDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
url = (String) ((SolrInputDocument) doc).getFieldValue(CollectionSchema.sku.getSolrFieldName());
}
if (doc instanceof SolrDocument) {
d = ((SolrDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
url = (String) ((SolrDocument) doc).getFieldValue(CollectionSchema.sku.getSolrFieldName());
}
if (doc instanceof org.apache.lucene.document.Document) {
String ds = ((org.apache.lucene.document.Document) doc).get(CollectionSchema.load_date_dt.getSolrFieldName());
try {
d = Long.parseLong(ds);
} catch (NumberFormatException e) {
                d = -1L;
}
url = ((org.apache.lucene.document.Document) doc).get(CollectionSchema.sku.getSolrFieldName());
}
if (d == null) return null;
long date = -1;
if (d instanceof Long) date = ((Long) d).longValue();
if (d instanceof Date) date = ((Date) d).getTime();
return new LoadTimeURL(url, date);
}
/**
     * Check if the given fields contain the id and load_date_dt fields and add them if necessary
     * @param fields the requested field list
     * @return fields with id and load_date_dt added if necessary
*/
protected static String[] ensureEssentialFieldsIncluded(String[] fields) {
if (fields != null && fields.length > 0) {
Set<String> f = new HashSet<String>();
for (String s: fields) f.add(s);
f.add(CollectionSchema.id.getSolrFieldName());
f.add(CollectionSchema.load_date_dt.getSolrFieldName());
fields = f.toArray(new String[f.size()]);
}
return fields;
}
/**
     * Get results from a solr query as a stream of documents.
     * The result queue is considered terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
     * The method returns immediately and feeds the search results into the queue
     * @param querystring the solr query string
     * @param sort the solr sort string; may be null if unused
     * @param offset the offset of the first result
     * @param maxcount the maximum number of results
     * @param maxtime the maximum time in milliseconds
     * @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used
     * @param concurrency the number of AbstractSolrConnector.POISON_DOCUMENT entries to add at the end of the feed
     * @param prefetchIDs if true, all IDs are fetched first and the documents are then queried by ID; if false, documents are retrieved directly
     * @param fields the list of fields to load
* @return a blocking queue which is terminated with AbstractSolrConnector.POISON_DOCUMENT as last element
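     *
     * A minimal usage sketch (the connector instance, query string and a concurrency of 1 are example assumptions):
     * <pre>{@code
     * BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(
     *         "*:*", null, 0, 1000, 60000L, 100, 1, false,
     *         CollectionSchema.id.getSolrFieldName());
     * SolrDocument doc;
     * while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
     *     // process doc here
     * }
     * }</pre>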
*/
@Override
public BlockingQueue<SolrDocument> concurrentDocumentsByQuery(
final String querystring,
final String sort,
final int offset,
final int maxcount,
final long maxtime,
final int buffersize,
final int concurrency,
final boolean prefetchIDs,
final String ... fields) {
List<String> querystrings = new ArrayList<>(1);
querystrings.add(querystring);
return concurrentDocumentsByQueries(querystrings, sort, offset, maxcount, maxtime, buffersize, concurrency, prefetchIDs, fields);
}
/**
* {@inheritDoc}
*/
@Override
public BlockingQueue<SolrDocument> concurrentDocumentsByQueries(
final List<String> querystrings,
final String sort,
final int offset,
final int maxcount,
final long maxtime,
final int buffersize,
final int concurrency,
final boolean prefetchIDs,
final String ... fields) {
final BlockingQueue<SolrDocument> queue = buffersize <= 0 ? new LinkedBlockingQueue<SolrDocument>() : new ArrayBlockingQueue<SolrDocument>(Math.max(buffersize, concurrency));
if (!prefetchIDs) {
final Thread t = new Thread(newDocumentsByQueriesTask(queue, querystrings, sort, offset, maxcount, maxtime, buffersize, concurrency, fields));
t.start();
return queue;
}
final BlockingQueue<String> idQueue = concurrentIDsByQueries(querystrings, sort, offset, maxcount, maxtime, Math.min(maxcount, 10000000), concurrency);
final long endtime = maxtime < 0 || maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; // we know infinity!
        final Thread[] t = new Thread[Math.max(1, concurrency)]; // at least one thread is started below, so the array must hold at least one entry
        for (int i = 0; i < t.length; i++) {
t[i] = new Thread("AbstractSolrConnector:concurrentDocumentsByQueriesWithPrefetch(" + querystrings.size() + " queries, first: " + querystrings.iterator().next() + ")") {
@Override
public void run() {
String nextID;
try {
while (System.currentTimeMillis() < endtime && (nextID = idQueue.take()) != AbstractSolrConnector.POISON_ID) {
try {
SolrDocument d = getDocumentById(nextID, fields);
                            // the document may be null if another process has deleted it meanwhile;
                            // in that case we silently ignore the missing document
if (d != null) try {queue.put(d);} catch (final InterruptedException e) {}
} catch (final SolrException | IOException e) {
ConcurrentLog.logException(e);
// fail
ConcurrentLog.severe("AbstractSolrConnector", "aborted concurrentDocumentsByQuery: " + e.getMessage());
break;
}
}
} catch (InterruptedException e) {
ConcurrentLog.severe("AbstractSolrConnector", "interrupted concurrentDocumentsByQuery: " + e.getMessage());
}
try {queue.put(AbstractSolrConnector.POISON_DOCUMENT);} catch (final InterruptedException e1) {}
}
};
t[i].start();
}
return queue;
}
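    /**
     * Create a task that feeds the given queue with the documents matching the given queries
     * and terminates the feed with {@code concurrency} (at least one) POISON_DOCUMENT entries.
     * @param queue the queue to feed; must not be null
     * @param querystrings the list of solr query strings
     * @param sort the solr sort string; may be null if unused
     * @param offset the offset of the first result
     * @param maxcount the maximum number of results per query
     * @param maxtime the maximum time in milliseconds
     * @param buffersize used to limit the page size of each request; if < 0 the default page size is used
     * @param concurrency the number of POISON_DOCUMENT entries to add at the end of the feed
     * @param fields the list of fields to load
     * @return a Runnable feeding the queue
     */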
@Override
public Runnable newDocumentsByQueriesTask(
final BlockingQueue<SolrDocument> queue,
final List<String> querystrings,
final String sort,
final int offset,
final int maxcount,
final long maxtime,
final int buffersize,
final int concurrency,
final String ... fields) {
Objects.requireNonNull(queue, "The queue parameter must not be null.");
if (querystrings == null || querystrings.isEmpty()) {
return () -> {
for (int i = 0; i < Math.max(1, concurrency); i++) {
try {
queue.put(AbstractSolrConnector.POISON_DOCUMENT);
} catch (final InterruptedException e1) {
Thread.currentThread().interrupt(); // preserve interrupted thread state
}
}
};
}
final long endtime = maxtime < 0 || maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; // we know infinity!
final int ps = buffersize < 0 ? pagesize_docs : Math.min(pagesize_docs, buffersize);
final int maxretries = 6;
return () -> {
long remainingTime = endtime - System.currentTimeMillis();
try {
for (final String querystring: querystrings) {
Thread.currentThread().setName("AbstractSolrConnector:concurrentDocumentsByQueryNoPrefetch(" + querystring + ")");
int o = offset;
int count = 0;
int retry = 0;
loop: while (remainingTime > 0 && count < maxcount) {
try {
final SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, ps), fields);
for (final SolrDocument d: sdl) {
if (endtime != Long.MAX_VALUE) {
/*
* A timeout is defined : we must not use here queue.put() otherwise this
* thread could indefinitely wait here when the queue is full and the
* consumer thread has stopped taking in the queue.
*/
if (!queue.offer(d, remainingTime, TimeUnit.MILLISECONDS)) {
break;
}
} else {
queue.put(d);
}
count++;
}
if (sdl.size() < ps) {
break loop; // finished
}
o += sdl.size();
retry = 0;
} catch(final InterruptedIOException e) {
throw new InterruptedException(); // rethrow to finish the process
} catch (final SolrException | IOException e) {
ConcurrentLog.logException(e);
if (retry++ < maxretries) {
// remote Solr may be temporary down, so we wait a bit
Thread.sleep(100);
continue loop;
}
// fail
ConcurrentLog.severe("AbstractSolrConnector", "aborted concurrentDocumentsByQueryNoPrefetch after " + maxretries + " retries: " + e.getMessage());
break;
}
remainingTime = endtime - System.currentTimeMillis();
}
}
} catch(final InterruptedException e) {
Thread.currentThread().interrupt(); // preserve interrupted thread state
} catch (final RuntimeException e) {
ConcurrentLog.logException(e);
} finally {
            /* Always feed the poison elements; stop early only if this thread gets interrupted */
for (int i = 0; i < Math.max(1, concurrency); i++) {
try {
queue.put(AbstractSolrConnector.POISON_DOCUMENT);
} catch (final InterruptedException e1) {
Thread.currentThread().interrupt(); // preserve interrupted thread state
break; // thread is interrupted : in that case we no more try to add poison elements to the queue
}
}
}
};
}
/**
     * get a document id result stream from a solr query.
     * The result queue is considered terminated if AbstractSolrConnector.POISON_ID is returned.
     * The method returns immediately and feeds the search results into the queue
     * @param querystring the solr query string
     * @param sort the solr sort string; may be null if unused
     * @param offset the offset of the first result
     * @param maxcount the maximum number of results
     * @param maxtime the maximum time in milliseconds
     * @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used
     * @param concurrency the number of AbstractSolrConnector.POISON_ID entries to add at the end of the feed
     * @return a blocking queue of ids which is terminated with a number of AbstractSolrConnector.POISON_ID entries
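     *
     * A minimal usage sketch (the connector instance and query string are example assumptions):
     * <pre>{@code
     * BlockingQueue<String> ids = connector.concurrentIDsByQuery(
     *         "*:*", null, 0, 1000, 60000L, 0, 1);
     * String id;
     * while ((id = ids.take()) != AbstractSolrConnector.POISON_ID) {
     *     // process id here
     * }
     * }</pre>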
*/
@Override
public BlockingQueue<String> concurrentIDsByQuery(
final String querystring,
final String sort,
final int offset,
final int maxcount,
final long maxtime,
final int buffersize,
final int concurrency) {
List<String> querystrings = new ArrayList<>(1);
querystrings.add(querystring);
return concurrentIDsByQueries(querystrings, sort, offset, maxcount, maxtime, buffersize, concurrency);
}
/**
     * get a document id result stream from a set of solr queries.
     * The result queue is considered terminated if AbstractSolrConnector.POISON_ID is returned.
     * The method returns immediately and feeds the search results into the queue
     * @param querystrings a list of query strings
     * @param sort the solr sort string; may be null if unused
     * @param offset common offset of all queries
     * @param maxcount maximum count for each query
     * @param maxtime the maximum time in milliseconds
     * @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used
     * @param concurrency the number of AbstractSolrConnector.POISON_ID entries to add at the end of the feed
     * @return a blocking queue of ids which is terminated with a number of AbstractSolrConnector.POISON_ID entries
*/
@Override
public BlockingQueue<String> concurrentIDsByQueries(
final List<String> querystrings,
final String sort,
final int offset,
final int maxcount,
final long maxtime,
final int buffersize,
final int concurrency) {
final BlockingQueue<String> queue = buffersize <= 0 ? new LinkedBlockingQueue<String>() : new ArrayBlockingQueue<String>(buffersize);
final long endtime = maxtime < 0 || maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; // we know infinity!
final Thread t = new Thread() {
@Override
public void run() {
try {
for (String querystring: querystrings) {
this.setName("AbstractSolrConnector:concurrentIDsByQueries(" + querystring + ")");
int o = offset;
while (System.currentTimeMillis() < endtime) {
try {
SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, maxcount < 0 ? pagesize_ids : Math.min(maxcount, pagesize_ids), CollectionSchema.id.getSolrFieldName());
int count = 0;
for (SolrDocument d: sdl) {
try {queue.put((String) d.getFieldValue(CollectionSchema.id.getSolrFieldName()));} catch (final InterruptedException e) {break;}
count++;
}
if (count < pagesize_ids) break;
o += count;
if (o > maxcount && maxcount > 0) break;
} catch (final SolrException e) {
break;
} catch (final IOException e) {
break;
}
}
}
                } catch (final Throwable e) {
                    ConcurrentLog.logException(e); // log instead of silently swallowing the error
                } finally {
for (int i = 0; i < concurrency; i++) {
try {queue.put(AbstractSolrConnector.POISON_ID);} catch (final InterruptedException e1) {}
}
}
}
};
t.start();
return queue;
}
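    /**
     * Iterate over all document ids in the index, using a catch-all query.
     * The iteration ends when the poison id is reached or when a poll
     * waits longer than one minute for the next id.
     */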
@Override
public Iterator<String> iterator() {
final BlockingQueue<String> queue = concurrentIDsByQuery(CATCHALL_QUERY, null, 0, Integer.MAX_VALUE, 60000, 2 * pagesize_ids, 1);
return new LookAheadIterator<String>() {
@Override
protected String next0() {
try {
String s = queue.poll(60000, TimeUnit.MILLISECONDS);
if (s == AbstractSolrConnector.POISON_ID) return null;
return s;
} catch (final InterruptedException e) {
return null;
}
}
};
}
/**
     * get a query result from solr
     * to get all results set the query String to "*:*"
     * @param querystring the solr query string
     * @param sort the solr sort string; may be null if unused
     * @param offset the offset of the first result
     * @param count the maximum number of results
     * @param fields the list of fields to load
     * @return the list of matching documents
     * @throws IOException
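     *
     * A minimal usage sketch (the connector instance and field choice are example assumptions):
     * <pre>{@code
     * SolrDocumentList page = connector.getDocumentListByQuery(
     *         "*:*", null, 0, 10, CollectionSchema.sku.getSolrFieldName());
     * for (SolrDocument doc : page) {
     *     // process doc here
     * }
     * }</pre>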
*/
@Override
public SolrDocumentList getDocumentListByQuery(
final String querystring,
final String sort,
final int offset,
final int count,
final String ... fields) throws IOException {
// construct query
final SolrQuery params = getSolrQuery(querystring, sort, offset, count, fields);
// query the server
final SolrDocumentList docs = getDocumentListByParams(params);
return docs;
}
public static SolrQuery getSolrQuery(
final String querystring,
final String sort,
final int offset,
final int count,
final String ... fields) {
// construct query
final SolrQuery params = new SolrQuery();
//if (count < 2 && querystring.startsWith("{!raw f=")) {
// params.setQuery("*:*");
// params.addFilterQuery(querystring);
//} else {
params.setQuery(querystring);
//}
params.clearSorts();
if (sort != null) {
params.set(CommonParams.SORT, sort);
}
params.setRows(count);
params.setStart(offset);
params.setFacet(false);
if (fields != null && fields.length > 0) params.setFields(fields);
params.setIncludeScore(false);
params.setParam("defType", "edismax");
params.setParam(DisMaxParams.QF, CollectionSchema.text_t.getSolrFieldName() + "^1.0");
return params;
}
/**
     * Check if a given document, identified by its url hash as document id, exists
     * @param id the url hash and document id
     * @return the metadata (url and load date) if an entry exists in solr, null otherwise
* @throws IOException
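     *
     * A minimal existence-check sketch (urlhash is a hypothetical document id):
     * <pre>{@code
     * LoadTimeURL md = connector.getLoadTimeURL(urlhash);
     * boolean exists = md != null;
     * }</pre>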
*/
@Override
public LoadTimeURL getLoadTimeURL(String id) throws IOException {
// construct raw query
final SolrQuery params = new SolrQuery();
//params.setQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + id + "\"");
String q = "{!cache=false raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id;
params.setQuery(q);
params.setRows(1);
params.setStart(0);
params.setFacet(false);
params.clearSorts();
params.setFields(CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName());
params.setIncludeScore(false);
// query the server
final SolrDocumentList sdl = getDocumentListByParams(params);
if (sdl == null || sdl.getNumFound() <= 0) return null;
SolrDocument doc = sdl.iterator().next();
LoadTimeURL md = getLoadTimeURL(doc);
return md;
}
/**
     * get the number of results that the given query would return.
     * This should only be called if the actual result is never used, and only the count is interesting
     * @param querystring the solr query string
     * @return the number of results for this query
     * @throws IOException
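     *
     * A minimal usage sketch (the connector instance is an example assumption):
     * <pre>{@code
     * long allDocuments = connector.getCountByQuery(AbstractSolrConnector.CATCHALL_QUERY);
     * }</pre>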
*/
@Override
public long getCountByQuery(String querystring) throws IOException {
// construct query
final SolrQuery params = new SolrQuery();
params.setQuery(querystring);
params.setRows(0); // essential to just get count
params.setStart(0);
params.setFacet(false);
params.clearSorts();
params.setFields(CollectionSchema.id.getSolrFieldName());
params.setIncludeScore(false);
// query the server
final SolrDocumentList sdl = getDocumentListByParams(params);
return sdl == null ? 0 : sdl.getNumFound();
}
/**
     * get facets of the index: a list of lists with values that are most common in a specific field
     * @param query a query which is performed to get the facets
     * @param maxresults the maximum size of the resulting maps
     * @param fields the field names which are selected as facet
     * @return a map with key = facet field name, value = an ordered map of field values for that field
     * @throws IOException
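     *
     * A minimal usage sketch (the connector instance and facet field are example assumptions):
     * <pre>{@code
     * LinkedHashMap<String, ReversibleScoreMap<String>> facets =
     *         connector.getFacets(AbstractSolrConnector.CATCHALL_QUERY, 10,
     *                 CollectionSchema.host_s.getSolrFieldName());
     * ReversibleScoreMap<String> hosts = facets.get(CollectionSchema.host_s.getSolrFieldName());
     * }</pre>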
*/
@Override
public LinkedHashMap<String, ReversibleScoreMap<String>> getFacets(String query, int maxresults, final String ... fields) throws IOException {
// construct query
assert fields.length > 0;
final SolrQuery params = new SolrQuery();
params.setQuery(query);
params.setRows(0);
params.setStart(0);
params.setFacet(true);
params.setFacetMinCount(1); // there are many 0-count facets in the uninverted index cache
params.setFacetLimit(maxresults);
params.setFacetSort(FacetParams.FACET_SORT_COUNT);
params.setParam(FacetParams.FACET_METHOD, FacetParams.FACET_METHOD_enum); // fight the fieldcache
params.setFields(fields);
params.clearSorts();
params.setIncludeScore(false);
for (String field: fields) params.addFacetField(field);
// query the server
QueryResponse rsp = getResponseByParams(params);
LinkedHashMap<String, ReversibleScoreMap<String>> facets = new LinkedHashMap<String, ReversibleScoreMap<String>>(fields.length);
for (String field: fields) {
FacetField facet = rsp.getFacetField(field);
ReversibleScoreMap<String> result = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
List<Count> values = facet.getValues();
if (values == null) continue;
for (Count ff: values) if (ff.getCount() > 0) result.set(ff.getName(), (int) ff.getCount());
facets.put(field, result);
}
return facets;
}
@Override
public SolrDocument getDocumentById(final String id, final String ... fields) throws IOException {
assert id.length() == Word.commonHashLength : "wrong id: " + id;
final SolrQuery query = new SolrQuery();
// construct query
StringBuilder sb = new StringBuilder(23);
sb.append("{!cache=false raw f=").append(CollectionSchema.id.getSolrFieldName()).append('}').append(id);
query.setQuery(sb.toString());
//query.setQuery("*:*");
//query.addFilterQuery(sb.toString());
query.clearSorts();
query.setRows(1);
query.setStart(0);
if (fields != null && fields.length > 0) query.setFields(fields);
query.setIncludeScore(false);
// query the server
try {
final SolrDocumentList docs = getDocumentListByParams(query);
if (docs == null || docs.isEmpty()) return null;
SolrDocument doc = docs.get(0);
return doc;
} catch (final Throwable e) {
            clearCaches(); // we clear the caches in case this was caused by an OOM condition
throw new IOException(e.getMessage(), e);
}
}
/**
* Update a solr document.
* This will write only a partial update for all fields given in the SolrInputDocument
* and leaves all other fields untouched.
* @param solrdoc
* @throws IOException
* @throws SolrException
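     *
     * A minimal partial-update sketch (urlhash and the title value are example assumptions):
     * <pre>{@code
     * SolrInputDocument patch = new SolrInputDocument();
     * patch.setField(CollectionSchema.id.getSolrFieldName(), urlhash);
     * patch.setField(CollectionSchema.title.getSolrFieldName(), "new title");
     * connector.update(patch); // only the title field is changed in the stored document
     * }</pre>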
*/
@Override
public void update(final SolrInputDocument solrdoc) throws IOException, SolrException {
this.add(partialUpdatePatch(solrdoc));
}
/**
* Update a collection of solr input documents.
* This will write only a partial update for all fields given in the SolrInputDocuments
* and leaves all other fields untouched.
* @param solrdocs
* @throws IOException
* @throws SolrException
*/
@Override
    public void update(final Collection<SolrInputDocument> solrdocs) throws IOException, SolrException {
        Collection<SolrInputDocument> docs = new ArrayList<>(solrdocs.size());
        for (SolrInputDocument doc: solrdocs) docs.add(partialUpdatePatch(doc));
this.add(docs);
}
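    /**
     * Transform a document into a Solr atomic update request: the id field is
     * copied as-is and every other field value is wrapped into a {"set": value}
     * map, so that Solr updates only the given fields and leaves all other
     * fields of the stored document untouched.
     * @param docIn the input document
     * @return a document carrying "set" atomic update instructions
     */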
private SolrInputDocument partialUpdatePatch(final SolrInputDocument docIn) {
SolrInputDocument docOut = new SolrInputDocument();
docOut.setField(CollectionSchema.id.name(), docIn.getFieldValue(CollectionSchema.id.name()));
for (Entry<String, SolrInputField> entry: docIn.entrySet()) {
if (entry.getKey().equals(CollectionSchema.id.name())) continue;
SolrInputField sif = entry.getValue();
            Map<String, Object> partialUpdate = new HashMap<>(1);
            Object value = sif.getValue();
            // wrap the value into a "set" modifier map to request a Solr atomic update of this field
            partialUpdate.put("set", value);
            docOut.setField(entry.getKey(), partialUpdate);
}
return docOut;
}
}