conduwuit/src/service/rooms/search/mod.rs

use std::sync::Arc;

use conduit::{
	implement,
	utils::{set, stream::TryIgnore, IterStream, ReadyExt},
	Result,
};
use database::Map;
use futures::StreamExt;
use ruma::RoomId;

use crate::{rooms, Dep};

pub struct Service {
	db: Data,
	services: Services,
}

struct Data {
	tokenids: Arc<Map>,
}

struct Services {
	short: Dep<rooms::short::Service>,
}

impl crate::Service for Service {
	fn build(args: crate::Args<'_>) -> Result<Arc<Self>> {
		Ok(Arc::new(Self {
			db: Data {
				tokenids: args.db["tokenids"].clone(),
			},
			services: Services {
				short: args.depend::<rooms::short::Service>("rooms::short"),
			},
		}))
	}

	fn name(&self) -> &str { crate::service::make_name(std::module_path!()) }
}

/// Adds every token of `message_body` to the inverted search index for this
/// PDU.
#[implement(Service)]
pub fn index_pdu(&self, shortroomid: u64, pdu_id: &[u8], message_body: &str) {
	let batch = tokenize(message_body)
		.map(|word| {
			// Key layout: shortroomid (8 bytes, big-endian) | word | 0xFF | pdu_id
			let mut key = shortroomid.to_be_bytes().to_vec();
			key.extend_from_slice(word.as_bytes());
			key.push(0xFF);
			key.extend_from_slice(pdu_id); // TODO: currently we save the room id a second time here
			(key, Vec::<u8>::new())
		})
		.collect::<Vec<_>>();

	self.db.tokenids.insert_batch(batch.iter());
}

/// Removes every token of `message_body` from the inverted search index for
/// this PDU.
#[implement(Service)]
pub fn deindex_pdu(&self, shortroomid: u64, pdu_id: &[u8], message_body: &str) {
	let batch = tokenize(message_body).map(|word| {
		// Same key layout as in `index_pdu`
		let mut key = shortroomid.to_be_bytes().to_vec();
		key.extend_from_slice(word.as_bytes());
		key.push(0xFF);
		key.extend_from_slice(pdu_id); // TODO: currently we save the room id a second time here
		key
	});

	for token in batch {
		self.db.tokenids.remove(&token);
	}
}

/// Searches this room's indexed PDUs for the tokens of `search_string`,
/// returning the matching PDU ids together with the tokenized search words.
/// Returns `None` when no shortroomid is known for `room_id`.
#[implement(Service)]
pub async fn search_pdus(&self, room_id: &RoomId, search_string: &str) -> Option<(Vec<Vec<u8>>, Vec<String>)> {
	let prefix = self
		.services
		.short
		.get_shortroomid(room_id)
		.await
		.ok()?
		.to_be_bytes()
		.to_vec();

	let words: Vec<_> = tokenize(search_string).collect();

	let bufs: Vec<_> = words
		.clone()
		.into_iter()
		.stream()
		.then(move |word| {
			// Scan all keys with the prefix shortroomid | word | 0xFF
			let mut prefix2 = prefix.clone();
			prefix2.extend_from_slice(word.as_bytes());
			prefix2.push(0xFF);
			let prefix3 = prefix2.clone();

			let mut last_possible_id = prefix2.clone();
			last_possible_id.extend_from_slice(&u64::MAX.to_be_bytes());

			self.db
				.tokenids
				.rev_raw_keys_from(&last_possible_id) // Newest pdus first
				.ignore_err()
				.ready_take_while(move |key| key.starts_with(&prefix2))
				.map(move |key| key[prefix3.len()..].to_vec())
				.collect::<Vec<_>>()
		})
		.collect()
		.await;

	// Intersect the per-word result sets
	let bufs = bufs.iter().map(|buf| buf.iter());
	let results = set::intersection(bufs).cloned().collect();

	Some((results, words))
}

/// Splits a string into the tokens used as keys in the search inverted index.
///
/// This may be used to tokenize both message bodies (for indexing) and search
/// queries (for querying).
fn tokenize(body: &str) -> impl Iterator<Item = String> + Send + '_ {
	body.split_terminator(|c: char| !c.is_alphanumeric())
		.filter(|s| !s.is_empty())
		.filter(|word| word.len() <= 50)
		.map(str::to_lowercase)
}
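
// A minimal illustrative sketch of the tokenizer's behavior, assuming the
// standard `char::is_alphanumeric` semantics: tokens are split on
// non-alphanumeric characters, lowercased, and words longer than 50 bytes are
// dropped.
#[cfg(test)]
mod tests {
	use super::tokenize;

	#[test]
	fn tokenize_splits_filters_and_lowercases() {
		let tokens: Vec<_> = tokenize("Hello, WORLD! 123").collect();
		assert_eq!(tokens, vec!["hello", "world", "123"]);

		// Words longer than 50 bytes are not indexed
		let long_word = "a".repeat(51);
		assert_eq!(tokenize(&long_word).count(), 0);
	}
}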