All Distance Sketch  0.1
All distance sketch based algorithms
graph_sketch.h
1 #ifndef SRC_SKETCH_GRAPH_SKETCH_H_
2 #define SRC_SKETCH_GRAPH_SKETCH_H_
3 
4 #include "node_sketch.h"
5 #include "utils.h"
6 
7 namespace all_distance_sketch {
8 
11 typedef std::vector<NodeSketch> NodesSketch;
14 typedef NodesSketch::iterator NodesSketchItr;
17 class GraphSketch {
18  public:
28  void InitGraphSketch(unsigned int K, int max_node_id,
29  const std::vector<int>* nodes_id = NULL) {
30  K_ = K;
31  should_calc_z_value_ = false;
32  nodes_ads_.resize(max_node_id);
33  reserve_size_ = K_ * log2(max_node_id);
34  prunning_thresholds_.resize(max_node_id);
35  CreateNodesDistribution(max_node_id, NULL, nodes_id);
36  }
37 
38  int GetK() const { return K_; }
41  void set_should_calc_zvalues(bool should_calc) {
42  should_calc_z_value_ = should_calc;
43  }
44 
45 #if PROTO_BUF
46 
49  const AllDistanceSketchGpb& all_distance_sketch) {
50  if (all_distance_sketch.has_configuration()) {
51  K_ = all_distance_sketch.configuration().k();
52  }
53  if (all_distance_sketch.has_configuration()) {
54  should_calc_z_value_ = all_distance_sketch.configuration().should_calc_z_values();
55  }
56  // Load the thresholds
57  int num_nodes = all_distance_sketch.node_thresholds_size();
58  prunning_thresholds_.resize(num_nodes);
59  nodes_random_id_.resize(num_nodes);
60  if (LoadThresholdsAndRandomId(all_distance_sketch) == false) {
61  LOG_M(NOTICE, "Unable to load threshold and random ID");
62  return false;
63  }
64  reserve_size_ = K_ * log2(prunning_thresholds_.size());
65  if (LoadNodesSketch(all_distance_sketch) == false) {
66  LOG_M(NOTICE, "Unable to load nodes NodeSketch");
67  return false;
68  }
69 
70  CalculateAllDistanceNeighborhood();
71  return true;
72  }
75  void SaveGraphSketchToGpb(AllDistanceSketchGpb* all_distance_sketch) {
76  all_distance_sketch->mutable_configuration()->set_k(K_);
77  // Save the prunning_thresholds_ & random ids
78  SaveThresholdAndRandomId(all_distance_sketch);
79  // Save each node NodeSketch
80  SaveNodesSketch(all_distance_sketch);
81  }
82 
85  bool LoadThresholdsAndRandomId(const AllDistanceSketchGpb& all_distance_sketch) {
86  int num_nodes = all_distance_sketch.node_thresholds_size();
87  prunning_thresholds_.resize(num_nodes);
88  nodes_random_id_.resize(num_nodes);
89  int node_id;
90  for (int i=0; i < num_nodes; i++) {
91  if (all_distance_sketch.node_thresholds(i).has_node_id() == false ||
92  all_distance_sketch.node_thresholds(i).has_threshold() == false ||
93  all_distance_sketch.node_thresholds(i).has_random_id() == false){
94  LOG_M(NOTICE, "Load from bad file!!!, missing data " <<
95  " node id == " << all_distance_sketch.node_thresholds(i).has_node_id() <<
96  " threshold == " << all_distance_sketch.node_thresholds(i).has_threshold() <<
97  " random id == " << all_distance_sketch.node_thresholds(i).has_random_id());
98  return false;
99  }
100  node_id = all_distance_sketch.node_thresholds(i).node_id();
101  if (node_id >= prunning_thresholds_.size() ||
102  node_id >= nodes_random_id_.size()) {
103  prunning_thresholds_.resize(node_id);
104  nodes_random_id_.resize(node_id);
105  }
106  prunning_thresholds_[node_id].SetDistance(
107  all_distance_sketch.node_thresholds(i).threshold());
108  nodes_random_id_[node_id] =
109  all_distance_sketch.node_thresholds(i).random_id();
110  }
111  ExtractSortedVersionFromDist();
112  return true;
113  }
114 
115  bool LoadNodesSketch(const AllDistanceSketchGpb& all_distance_sketch) {
116  for (int i=0; i < all_distance_sketch.nodes_sketches_size(); i++) {
117  if (all_distance_sketch.nodes_sketches(i).has_node_id() == false ||
118  all_distance_sketch.nodes_sketches(i).has_node_random_id() == false) {
119  LOG_M(NOTICE,
120  "Loading nodes NodeSketch, Load from bad file!!! missing "
121  "data "
122  << " node id == "
123  << all_distance_sketch.nodes_sketches(i).has_node_id()
124  << " random id == "
125  << all_distance_sketch.nodes_sketches(i)
126  .has_node_random_id());
127  return false;
128 
129  }
130  int node_id = all_distance_sketch.nodes_sketches(i).node_id();
131  double node_random_id = all_distance_sketch.nodes_sketches(i).node_random_id();
132  if (node_id >= nodes_ads_.size()) {
133  nodes_ads_.resize(node_id + 1);
134  }
135  // Init the NodeSketch
136  nodes_ads_[node_id].InitNodeSketch(K_, node_id, node_random_id,
137  &prunning_thresholds_,
138  reserve_size_,
139  should_calc_z_value_);
140 
141  // Load All nodes in the sketch
142  for (int k=0;
143  k < all_distance_sketch.nodes_sketches(i).node_in_sketch_size();
144  k++) {
145  const NodeSummaryDetailsGpb& node_in_sketch = \
146  all_distance_sketch.nodes_sketches(i).node_in_sketch(k);
147  if (node_in_sketch.has_node_id() == false ||
148  node_in_sketch.has_distance() == false) {
149  LOG_M(
150  NOTICE,
151  "Loading nodes NodeSketch, Load from bad file!!! missing "
152  "data "
153  << " node id == " << node_in_sketch.has_node_id()
154  << " distance == " << node_in_sketch.has_distance());
155  return false;
156  }
157  int sketch_node_id = node_in_sketch.node_id();
158  graph::EdgeWeight distance = node_in_sketch.distance();
159  NodeIdDistanceData node_in_sketch_details(sketch_node_id, distance);
160  nodes_ads_[node_id].nodes_id_distance_.push_back(
161  node_in_sketch_details);
162  }
163  ZValues z;
164  for (int j=0;
165  j < all_distance_sketch.nodes_sketches(i).z_value_size(); ++j) {
166  const ZValuesGpb& z_value_in_sketch = \
167  all_distance_sketch.nodes_sketches(i).z_value(j);
168  graph::EdgeWeight distance = z_value_in_sketch.distance();
169  int node_id = z_value_in_sketch.node_id();
170  NodeIdDistanceData d(node_id, distance);
171  z[distance] = d;
172  }
173  nodes_ads_[node_id].set_z_values(&z);
174  }
175  SetDisributionToNodes();
176  return true;
177  }
178 
179  void SaveThresholdAndRandomId(AllDistanceSketchGpb* all_distance_sketch) {
180  if (nodes_random_id_.size() != prunning_thresholds_.size()) {
181  LOG_M(NOTICE," PROBLEM threshold and random ids are not equal ");
182  return;
183  }
184  for (int i = 0; i < prunning_thresholds_.size(); i++) {
185  NodeThresholdGpb *node_threshold = all_distance_sketch->add_node_thresholds();
186  node_threshold->set_node_id(i);
187  node_threshold->set_threshold(prunning_thresholds_[i].GetDistance());
188  node_threshold->set_random_id(nodes_random_id_[i]);
189  }
190  }
191 
192  void SaveNodesSketch(AllDistanceSketchGpb* all_distance_sketch) {
193  for (int i = 0; i < nodes_random_id_.size(); i++) {
194  if (nodes_ads_[i].IsInit() == false) {
195  continue;
196  }
197  // Setting up the node details
198  SingleNodeSketchGpb* single_node_sketch = all_distance_sketch->add_nodes_sketches();
199  single_node_sketch->set_node_id(i);
200  single_node_sketch->set_node_random_id(nodes_random_id_[i]);
201  // Getting the node sketch
202  const NodeIdDistanceVector* nodes_in_sketch =
203  &(nodes_ads_[i].nodes_id_distance_);
204  for (int j=0; j < nodes_in_sketch->size(); j++) {
205  NodeSummaryDetailsGpb * node_details = single_node_sketch->add_node_in_sketch();
206  node_details->set_node_id((*nodes_in_sketch)[j].GetNId());
207  node_details->set_distance((*nodes_in_sketch)[j].GetDistance());
208  }
209  for (const auto zvalue : nodes_ads_[i].GetZValues() ) {
210  ZValuesGpb* z_value = single_node_sketch->add_z_value();
211  z_value->set_distance(zvalue.first);
212  z_value->set_node_id(zvalue.second.GetNId());
213  }
214  }
215  }
216 
217 #endif
218  bool ShouldPrune(graph::EdgeWeight distance, int node_id) {
219  if (distance > prunning_thresholds_[node_id].GetDistance()) {
220  return true;
221  }
222  if (should_calc_z_value_) {
223  return (distance == prunning_thresholds_[node_id].GetDistance() &&
224  nodes_ads_[node_id].HasZValue(distance));
225  }
226  return distance >= prunning_thresholds_[node_id].GetDistance();
227  }
228 
229  void CalculateInsertProb(int node_id, std::vector<NodeProb> * insert_prob) {
230  nodes_ads_[node_id].CalculateInsertProb(insert_prob);
231  }
232 
233  void SetPrunningThresholds() {
234  for (unsigned int i = 0; i < nodes_ads_.size(); i++) {
235  nodes_ads_[i].SetPrunningThresholds(&prunning_thresholds_);
236  }
237  }
238 
239  NodeSketch* UTGetNodeSketch(int node_id) { return &nodes_ads_[node_id]; }
242  NodeSketch* GetNodeSketch(const NodeIdRandomIdData& node_details) {
243  if (nodes_ads_.size() <= (unsigned int)node_details.GetNId()) {
244  return NULL;
245  }
246  if (nodes_ads_[node_details.GetNId()].IsInit() == false) {
247  nodes_ads_[node_details.GetNId()].InitNodeSketch(
248  K_, node_details.GetNId(), node_details.GetRandomId(),
249  &prunning_thresholds_, reserve_size_, should_calc_z_value_);
250  }
251  return &nodes_ads_[node_details.GetNId()];
252  }
255  void Printthresholds() {
256  for (unsigned int i = 0; i < prunning_thresholds_.size(); i++) {
257  std::cout << "i=" << i
258  << " thresholds=" << prunning_thresholds_[i].GetDistance()
259  << std::endl;
260  }
261  }
262 
263  int InsertCandidatesNodes() {
264  int numCleaned = 0;
265  for (unsigned int i = 0; i < nodes_ads_.size(); i++) {
266  numCleaned += nodes_ads_[i].InsertCandidatesNodes();
267  }
268  return numCleaned;
269  }
270 
271  int InsertCandidatesNodes(unsigned int start, unsigned int end) {
272  int numCleaned = 0;
273  for (unsigned int i = start; (i < nodes_ads_.size() && i < end); i++) {
274  numCleaned += nodes_ads_[i].InsertCandidatesNodes();
275  }
276  return numCleaned;
277  }
278 
279  void CalculateAllDistanceNeighborhood() {
280  for (unsigned int i = 0; i < nodes_ads_.size(); i++) {
281  nodes_ads_[i].CalculateAllDistanceNeighborhood();
282  }
283  }
284 
285  const std::vector<PrunningThreshold>* GetThresholds() const {
286  return &prunning_thresholds_;
287  }
288 
289  const NodesSketch& GetNodesSketch() const { return nodes_ads_; }
290 
291  bool operator==(const GraphSketch& other) const {
292  if (GetK() != other.GetK()) {
293  LOG_M(NOTICE, "K is different!" <<
294  "lhs = " << GetK() <<
295  "rhs = " << other.GetK());
296  return false;
297  }
298  if ((*GetNodesDistributionLean()) != (*other.GetNodesDistributionLean())) {
299  LOG_M(NOTICE, "Distribution are not euqal!");
300  return false;
301  }
302 
303  if ( (*GetThresholds()) != (*other.GetThresholds())) {
304  LOG_M(NOTICE, "thresholds are not equal!");
305  return false;
306  }
307 
308  if ((*GetNodesDistribution()) != (*GetNodesDistribution())) {
309  LOG_M(NOTICE, "Distribution sorted are not equal!");
310  return false;
311  }
312 
313  if (GetNodesSketch() != other.GetNodesSketch()) {
314  LOG_M(NOTICE, "Nodes NodeSketch are not equal!");
315  return false;
316  }
317  return true;
318  }
319 
320  void CreateNodesDistribution(unsigned int max_node_id,
321  UniformRankCalculator* calculator = NULL,
322  const std::vector<int>* nodes_id = NULL) {
323  UniformRankCalculator c;
324  c.InitUniformRankCalculator();
325  if (calculator == NULL ) {
326  calculator = &c;
327  }
328  typedef boost::minstd_rand base_generator_type;
329  base_generator_type generator(42u);
330  boost::uniform_real<> uni_dist(0, 1);
331  boost::variate_generator<base_generator_type&, boost::uniform_real<> > uni(generator, uni_dist);
332  nodes_random_id_.resize(max_node_id);
333  int num_nodes = nodes_id == NULL ? max_node_id : nodes_id->size();
334  for (unsigned int i = 0 ; i < num_nodes; i++) {
335  int node_id = nodes_id == NULL ? i : (*nodes_id)[i];
336  nodes_random_id_[node_id] =
337  calculator->CalculateNodeRank(node_id); // uni();
338  }
339 
340  ExtractSortedVersionFromDist();
341  SetDisributionToNodes();
342  /*
343  for (unsigned int i=0; i < num_nodes; i++) {
344  int node_id = nodes_id == NULL ? i : (*nodes_id)[i];
345  NodeDistanceIdRandomIdData b(0, node_id, nodes_random_id_[node_id]);
346  nodes_random_id_sorted_increasing_.push_back(b);
347  }
348 
349  std::sort(nodes_random_id_sorted_increasing_.begin(),
350  nodes_random_id_sorted_increasing_.end(),
351  compare_node_randomid_decreasing());*/
352  }
353 
354  const std::vector<NodeDistanceIdRandomIdData>* GetNodesDistribution()
355  const {
356  return &nodes_random_id_sorted_increasing_;
357  }
360  NodeSketch* GetNodeSketch(NodeDistanceIdRandomIdData node_details) {
361  return GetNodeSketch(node_details.GetDetails());
362  }
363 
366  const std::vector<RandomId>* GetNodesDistributionLean() const {
367  return &nodes_random_id_;
368  }
371  RandomId GetNodeRandomId(const int& node_id) {
372  if ((unsigned int)node_id > nodes_random_id_.size()) {
373  return 2;
374  }
375  return nodes_random_id_[node_id];
376  }
380  void SetNodesDistribution(const std::vector<RandomId>* nodes_random_id) {
381  nodes_random_id_.clear();
382  nodes_random_id_.resize(nodes_random_id->size());
383  std::copy(nodes_random_id->begin(), nodes_random_id->end(),
384  nodes_random_id_.begin());
385  ExtractSortedVersionFromDist();
386  SetDisributionToNodes();
387  }
388 
389  NodesSketchItr Begin() {
390  return nodes_ads_.begin();
391  }
392 
393  NodesSketchItr End() {
394  return nodes_ads_.end();
395  }
396 
397  private:
398  void SetDisributionToNodes() {
399  for (int i=0; i < nodes_ads_.size(); i++) {
400  nodes_ads_[i].SetDisribution(&nodes_random_id_);
401  }
402  }
403  void ExtractSortedVersionFromDist() {
404  nodes_random_id_sorted_increasing_.clear();
405  for (unsigned int i = 0; i < nodes_random_id_.size(); i++) {
406  NodeDistanceIdRandomIdData b(0, i, nodes_random_id_[i]);
407  nodes_random_id_sorted_increasing_.push_back(b);
408  }
409 
410  std::sort(nodes_random_id_sorted_increasing_.begin(),
411  nodes_random_id_sorted_increasing_.end(),
412  compare_node_randomid_decreasing());
413  }
414 
415  bool should_calc_z_value_;
416  unsigned int reserve_size_;
417  unsigned int K_;
418  NodesSketch nodes_ads_;
419  std::vector<PrunningThreshold> prunning_thresholds_;
420  std::vector<RandomId> nodes_random_id_;
421  std::vector<NodeDistanceIdRandomIdData> nodes_random_id_sorted_increasing_;
422 };
427 } // namespace all_distance_sketch
428 
429 #endif // SRC_SKETCH_GRAPH_SKETCH_H_
void InitGraphSketch(unsigned int K, int max_node_id, const std::vector< int > *nodes_id=NULL)
Init the class.
Definition: graph_sketch.h:28
Single node sketch.
Definition: node_sketch.h:204
Definition: common.h:53
NodesSketch::iterator NodesSketchItr
Definition: graph_sketch.h:14
std::vector< NodeIdDistanceData > NodeIdDistanceVector
Container for node id and distance.
Definition: node_sketch.h:193
void set_should_calc_zvalues(bool should_calc)
if true when calculating the sketch we will also calculate the insertion probabilities of each node...
Definition: graph_sketch.h:41
void SetNodesDistribution(const std::vector< RandomId > *nodes_random_id)
sets the random id for each node. The default distribution is unifrom [0,1]
Definition: graph_sketch.h:380
const std::vector< RandomId > * GetNodesDistributionLean() const
returns the random id that was given to each node
Definition: graph_sketch.h:366
RandomId GetNodeRandomId(const int &node_id)
returns specific node random id
Definition: graph_sketch.h:371
std::vector< NodeSketch > NodesSketch
Definition: graph_sketch.h:11
Data structure for the graph sketch.
Definition: graph_sketch.h:17
void SaveGraphSketchToGpb(AllDistanceSketchGpb *all_distance_sketch)
Save the graph sketch to a Gpb class.
Definition: graph_sketch.h:75
double RandomId
Type for random ids.
Definition: node_sketch.h:14
bool LoadGraphSketchFromGpb(const AllDistanceSketchGpb &all_distance_sketch)
Load the graph sketch from a Gpb class.
Definition: graph_sketch.h:48
Container class to store node id and distance.
Definition: node_sketch.h:26