(* Copyright (C) 1992, Digital Equipment Corporation                         *)
(* All rights reserved.                                                      *)
(* See the file COPYRIGHT for a full description.                            *)

(* Created by stolfi on Thu Sep  6  0:58:48 PDT 1990           *)
(* Last modified on Sat Jun 27 15:46:01 PDT 1992 by muller     *)
(*      modified on Thu Sep 19 20:38:00 1991 by kalsow     *)
(*      modified on Tue May 14 13:01:28 PDT 1991 by stolfi     *)

MODULE TextSet;

IMPORT TextF, List, Thread, Word;

CONST
  NullKey = "";        (* Marks pristine entries *)
  DeletedKey = "-DEL-";  (* Marks deleted entries *)

CONST

  NullEntry = Entry{NullKey};

TYPE
  Entry = RECORD  
      key: Key; 
    END;

  Buckets = REF ARRAY OF Entry;

  TableRec = RECORD
      mutex: MUTEX;
      buckets: Buckets;           (* The bucket area *)
      nullKeyPresent: BOOLEAN;    (* TRUE iff NullKey is in set *)
      deletedKeyPresent: BOOLEAN; (* TRUE iff DeletedKey is in set *)

      (* Current statistics: *)
      avgProbes: REAL;            (* Running average of probes per operation *)
      numFilled: CARDINAL;        (* Current number of really occupied slots *)
      numDeleted: CARDINAL;       (* Current number of DeletedKey entries *)

    END;

TYPE 
  TT = T BRANDED OBJECT 
      rec: TableRec
    OVERRIDES
      in := In; 

      inChars := InChars; 

      put := Put;
      delete := Delete;
      clear := Clear;
      copy := Copy;

      toList := ToKeyList;

      enumerate := Enumerate;

    END;

CONST 
  StepMultiplier = 1052823;  (* Multiplier for step hashing *)

CONST
  (* Thresholds for rehashing the set: *)
  MaxAvgProbes = 4.0;        (* Max average probes/query *)
  MaxDeletedEntries = 0.75;  (* Max deleted entries in set *)
  MinExpansion = 1.25;       (* Minimum expansion factor when rehasing *)

CONST Decay = 0.0625; (* decay for running average of number of probes *)

CONST 
  standardSize = ARRAY OF CARDINAL{
          23,
          89,
         181,
         359,
         719,
        1447,
        2887,
        4093,
        5791,
        8191,
       11579,
       16381,
       23167,
       32749,
       46337,
       65521,
       92681,
      131071,
      185363,
      262139,
      370723,
      524287,
      741431,
     1048573,
     1482907,
     2097143,
     2965819,
     4194301,
     5931641,
     8388593,
    11863279,
    16777213,
    23726561,
    33554393,
    47453111,
    67108859,
    94906249,
   134217689  (* Should be enough...8-) *)
  };
  (*
    Standard set sizes. They must all be prime, and increase in
    roughly geometric progression. The smaller sizes increase faster
    to offset the allocation overhead. (I hope). *)

CONST
  MaxBuckets = standardSize[LAST(standardSize)];
  MinBuckets = standardSize[FIRST(standardSize)];

PROCEDURE New(

    initialSize: CARDINAL := 1;
  ): T RAISES {} =
  VAR set: TT := NEW(TT);
  BEGIN
    WITH t = set.rec DO
      t.mutex := NEW (Thread.Mutex);
      t.buckets := NewBuckets(initialSize);

      t.nullKeyPresent := FALSE;
      t.deletedKeyPresent := FALSE;
      t.avgProbes := 0.0;
      t.numFilled := 0;
      t.numDeleted := 0;

    END;
    RETURN set
  END New;

PROCEDURE TextEqual(a, b: TEXT): BOOLEAN RAISES {} =
  BEGIN
    IF a = b THEN
      RETURN TRUE
    ELSE
      RETURN (a^ = b^);
    END;
  END TextEqual;

PROCEDURE CharsEqual(READONLY a: CHARS; b: TEXT): BOOLEAN RAISES {} =
  BEGIN
    WITH bb = b^ DO
      RETURN (a = SUBARRAY(bb, 0, NUMBER (bb) - 1));
    END
  END CharsEqual;

PROCEDURE TextHash(key: Key): Word.T =
  CONST M = -1640531527;
  VAR r := 0;
  BEGIN
    WITH a = key^ DO
      FOR i := 0 TO LAST(a) - 1 DO
        r := Word.Plus(ORD(a[i]), Word.Times(M, r))
      END;
      RETURN r
    END;
  END TextHash;

PROCEDURE CharsHash(READONLY a: CHARS): Word.T =
  CONST M = -1640531527;
  VAR r := 0;
  BEGIN
    FOR i := 0 TO LAST(a) DO
      r := Word.Plus(ORD(a[i]), Word.Times(M, r))
    END;
    RETURN r
  END CharsHash;

PROCEDURE NewBuckets(size: CARDINAL): Buckets RAISES {} =
  VAR i: CARDINAL;
  BEGIN
    (* Round /size/ to next largest standard size: *)
    i := 0;
    WHILE (i < LAST(standardSize)) AND (size > standardSize[i]) DO INC(i) END;
    size := standardSize[i];
    WITH rb = NEW(Buckets, size), buck = rb^ DO
      FOR i := 0 TO LAST(buck) DO buck[i] := NullEntry END;
      RETURN rb
    END
  END NewBuckets;

PROCEDURE Put(
    set: TT; 
    key: Key; 
  ): BOOLEAN RAISES {} =
  VAR probes: CARDINAL;
      found, crowded: BOOLEAN;
  BEGIN
    WITH t = set.rec DO
      LOCK t.mutex DO

        (* Special keys: *)
        IF TextEqual(key, NullKey) THEN
          IF  t.nullKeyPresent THEN
            RETURN TRUE
          ELSE
            t.nullKeyPresent := TRUE; RETURN FALSE
          END;
        ELSIF TextEqual(key, DeletedKey) THEN
          IF t.deletedKeyPresent THEN
            RETURN TRUE
          ELSE
            t.deletedKeyPresent := TRUE; RETURN FALSE
          END
        END;

        (* General case: *)
        WITH hash = TextHash(key) DO
          LOOP
            found := DoPut(
              t, t.buckets^, hash, 
              key, 
              (*OUT*) probes, crowded
            );

            IF NOT crowded THEN
              t.avgProbes := (1.0 - Decay) * t.avgProbes + Decay * FLOAT(probes);

              IF (t.avgProbes >       (MaxAvgProbes)) 
              AND (NUMBER(t.buckets^) < MaxBuckets) THEN
                AvgProbesTooHigh(t, probes);
              END;
              RETURN found
            END;
            (* T is too crowded to insert this item: *)

            Rehash(t, expand := TRUE)
          END;
        END
      END
    END;
  END Put;

PROCEDURE DoPut(
    VAR t: TableRec;
    VAR buck: ARRAY OF Entry; 
    hash: Word.T;
    key: Key;
    VAR (*OUT*) probes: CARDINAL;
    VAR (*OUT*) crowded: BOOLEAN;
  ): BOOLEAN RAISES {} =
  (* Stores data in bucket array, returns TRUE iff key was already
     present.  Sets /crowded/ to TRUE (and returns FALSE) if put failed
     because set was too crowded/unbalanced *)
  VAR 
    i, free: CARDINAL;
  BEGIN
    WITH 
      M = NUMBER(buck),
      chainBase = Word.Mod(hash, M),
      chainStep = 1 + Word.Mod(Word.Times(hash, StepMultiplier), M - 1)
    DO
      i := chainBase;
      probes := 0;
      free := M;
      LOOP
        INC(probes);
        WITH entry = buck[i] DO
          (* Examine entry *)
          IF (entry.key = NullKey) THEN 
            (* Reached an empty slot: Key is not there *)
            IF free # M THEN
              buck[free].key := key; 
              DEC(t.numDeleted)
            ELSE
              entry.key := key; 
            END;
            INC(t.numFilled);
            crowded := FALSE;
            RETURN FALSE 
          ELSIF (entry.key = DeletedKey) THEN
            (* Deleted slot --- treat as a wrong key, but remember it *)
            IF free = M THEN free := i END;
          ELSIF TextEqual(entry.key, key) THEN 
            (* Key is already there: *)
            crowded := FALSE;
            RETURN TRUE
          END;
          (* Should we go on? *)
          IF probes >= M THEN
            (* Reached end of list, key is not there *)
            IF free = M THEN
              (* No space to insert *)
              crowded := TRUE
            ELSE
              (* Insert in lieu of deleted slot *)
              buck[free].key := key; 
              DEC(t.numDeleted);
              INC(t.numFilled);
              crowded := FALSE
            END;
            RETURN FALSE
          END
        END;
        i := (i + chainStep) MOD M
      END
    END;
  END DoPut;

PROCEDURE In(
    set: TT; 
    key: Key;
  ): BOOLEAN RAISES {} =
  BEGIN 
    <* ASSERT key # NIL *>

    RETURN InChars(set, SUBARRAY(key^, 0, NUMBER(key^)-1));

  END In;

PROCEDURE InChars(
    set: TT; 
    READONLY key: CHARS;
  ): BOOLEAN RAISES {} =

  VAR 
    probes: CARDINAL;
    found: BOOLEAN;
  BEGIN
    WITH t = set.rec DO
      LOCK t.mutex DO

        (* Special keys: *)

        IF CharsEqual(key, NullKey) THEN

          IF t.nullKeyPresent THEN
            RETURN TRUE
          ELSE
            RETURN FALSE
          END;

        ELSIF CharsEqual(key, DeletedKey) THEN

          IF t.deletedKeyPresent THEN
            RETURN TRUE
          ELSE
            RETURN FALSE
          END;
        END;

        (* General case: *)

        WITH hash = CharsHash(key) DO

          found := DoIn(
            t, t.buckets^, hash, 
            key, 
            (*OUT*) probes
          );

          t.avgProbes := (1.0 - Decay) * t.avgProbes + Decay * FLOAT(probes);

          IF (t.avgProbes >       (MaxAvgProbes)) 
          AND (NUMBER(t.buckets^) < MaxBuckets) THEN
            AvgProbesTooHigh(t, probes);
          END;
          RETURN found
        END
      END
    END

  END InChars;

PROCEDURE DoIn(
    <*UNUSED*> VAR t: TableRec;
    VAR buck: ARRAY OF Entry; 
    hash: Word.T;

    READONLY key: CHARS; 

    VAR (*OUT*) probes: CARDINAL;
  ): BOOLEAN RAISES {} =
  (* Locates key in bucket array, 
       returns TRUE;
     If not found, returns FALSE. *)
  VAR 
    i, free: CARDINAL;
  BEGIN
    WITH 
      M = NUMBER(buck),
      chainBase = Word.Mod(hash, M),
      chainStep = 1 + Word.Mod(Word.Times(hash, StepMultiplier), M - 1)
    DO
      i := chainBase;
      probes := 0;
      free := M;
      LOOP
        INC(probes);
        WITH entry = buck[i] DO
          IF (entry.key = NullKey) THEN 
            (* Reached end of chain, didn't find key *)
            RETURN FALSE
          ELSIF (entry.key = DeletedKey) THEN
            (* Treat as wrong key, but remember it *)
            IF free = M THEN free := i END;

          ELSIF CharsEqual(key, entry.key) THEN 

            IF free # M THEN
              (* Relocate entry to deleted slot: *)
              buck[free] := entry;
              entry.key := DeletedKey;
            END;
            RETURN TRUE
          ELSIF probes >= M THEN
            (* Reached end of chain, didn't find key *)
            RETURN FALSE
          END
        END;
        i := (i + chainStep) MOD M
      END
    END;
  END DoIn;

PROCEDURE Delete(
    set: TT; 
    key: Key; 
  ): BOOLEAN RAISES {} =
  VAR 
    probes: CARDINAL;
    found: BOOLEAN;
  BEGIN
    WITH t = set.rec DO
      LOCK t.mutex DO

        (* Special keys: *)
        IF TextEqual(key, NullKey) THEN
          IF t.nullKeyPresent THEN
            t.nullKeyPresent := FALSE;
            RETURN TRUE
          ELSE
            RETURN FALSE
          END;
        ELSIF TextEqual(key, DeletedKey) THEN
          IF t.deletedKeyPresent THEN
            t.deletedKeyPresent := FALSE;
            RETURN TRUE
          ELSE
            RETURN FALSE
          END;
        END;

        (* General case: *)
        WITH hash = TextHash(key) DO
          found := DoDelete(
            t, t.buckets^, hash, 
            key, 
            (*OUT*) probes
          );

          t.avgProbes := (1.0 - Decay) * t.avgProbes + Decay * FLOAT(probes);

          IF (t.avgProbes >       (MaxAvgProbes)) AND 
          (NUMBER(t.buckets^) < MaxBuckets) THEN
            AvgProbesTooHigh(t, probes);
          END;
          IF found THEN 
            WITH M = NUMBER(t.buckets^) DO
              IF (FLOAT(t.numDeleted) >  (MaxDeletedEntries) * FLOAT(M))
              AND M > MinBuckets THEN

                Rehash(t, expand := FALSE)
              END
            END
          END;
          RETURN found
        END
      END
    END
  END Delete;

PROCEDURE DoDelete(
    VAR t: TableRec;
    VAR buck: ARRAY OF Entry; 
    hash: Word.T;
    key: Key; 
    VAR (*OUT*) probes: CARDINAL;
  ): BOOLEAN RAISES {} =
  (* Locates key in bucket array; 
      if found, 
        replaces entry's key by DeletedKey, 
        and return TRUE;
      if not found,
        and returns FALSE.
    *)
  VAR 
    i: CARDINAL;
  BEGIN
    WITH 
      M = NUMBER(buck),
      chainBase = Word.Mod(hash, M),
      chainStep = 1 + Word.Mod(Word.Times(hash, StepMultiplier), M - 1)
    DO
      i := chainBase;
      probes := 0;
      LOOP
        INC(probes);
        WITH entry = buck[i] DO
          IF (entry.key = NullKey) THEN 
            RETURN FALSE
          ELSIF (NOT (entry.key = DeletedKey))
          AND TextEqual(entry.key, key) THEN 
            entry.key := DeletedKey;
            INC(t.numDeleted);
            DEC(t.numFilled);
            RETURN TRUE
          ELSIF probes >= M THEN
            RETURN FALSE
          END
        END;
        i := (i + chainStep) MOD M
      END
    END;
  END DoDelete;

PROCEDURE AvgProbesTooHigh(VAR t: TableRec; <*UNUSED*> probes: CARDINAL) RAISES {} =
  (* Expands t.buckets because of too many probes/operation *)
  BEGIN

    IF FLOAT(t.numFilled) > FLOAT(NUMBER(t.buckets^))*(1.0- (MaxDeletedEntries)) THEN
      Rehash(t, expand := TRUE)
    ELSE
      (* Table already too big to expand; reset average probes, and hope for the best *)
      t.avgProbes := 0.0
    END
  END AvgProbesTooHigh;

PROCEDURE Rehash(
    VAR t: TableRec;
    expand: BOOLEAN;  (* TRUE if called because set is too crowded *)
  ) RAISES {} =
  VAR oldBuckets, newBuckets: Buckets;
      size: CARDINAL;
      success: BOOLEAN;
  BEGIN

    REPEAT
      (* Create a new bucket array of sufficient size, and swap bucket arrays: *)
      oldBuckets := t.buckets;
      WITH 
        idealOccupancy = (1.0 - 1.0/FLOAT(      (MaxAvgProbes)))/1.5,
        idealNewSize = ROUND(FLOAT(t.numFilled) / idealOccupancy)
      DO

        size := idealNewSize
      END;
      IF expand THEN
        WITH 
          effectiveSize = NUMBER(oldBuckets^) - t.numDeleted,
          minNewSize = ROUND(FLOAT(effectiveSize) *       (MinExpansion))
        DO

          size := MAX(size, minNewSize);
          <* ASSERT size > NUMBER(oldBuckets^) - t.numDeleted *>
        END
      END;
      newBuckets := NewBuckets(size);
      <* ASSERT NUMBER(newBuckets^) >= t.numFilled *>
      
      t.buckets := newBuckets;
      t.numDeleted := 0;
      t.numFilled := 0;
      expand := TRUE; (* Second and later attempts should expand *)

      (* Move old entries to new bucket array, and clean old one:  *)
      success := CopyEntries(t, oldBuckets^, newBuckets^);
      
      (* Discard old bucket array: *)
      oldBuckets := NIL;
      (* Make sure that second and later attempts really expand set: *)
      expand := TRUE;
    UNTIL success;

    (* Reset running probe average to expected average: *)
    WITH M = NUMBER(t.buckets^) DO
      t.avgProbes := FLOAT(M + 1)/FLOAT(M + 1 - t.numFilled)
    END;

  END Rehash;

PROCEDURE CopyEntries(
    VAR t: TableRec;
    VAR (*IO*) old, new: ARRAY OF Entry;
  ): BOOLEAN RAISES {} =
  (* Moves entries from /old/ buckets to the /new/ ones, in hashing order if possible.
     Assumes /new/ is empty to begin with, and ensures that /old/ is
     clean upon return.  Returns TRUE if all elements were hashed properly, 
     FALSE otherwise. *)
  VAR probes, next: CARDINAL;
      found, crowded: BOOLEAN;
  BEGIN
    crowded := FALSE;
    next := 0;
    (* Ignore number of probes, except if it exceeds the max probes per item: *)
    FOR i := 0 TO LAST(old) DO
      WITH e = old[i] DO
        IF (e.key = NullKey) THEN
          (* Ignore *)
        ELSIF (e.key = DeletedKey) THEN
          e.key := NullKey
        ELSE
          IF crowded THEN
            (* Insert into first free slot: *)
            WHILE NOT (new[next].key = NullKey) DO INC(next) END;
            new[next] := e; INC(next)
          ELSE
            (* Try hashing into new set. *)
            (* Do not worry about the average number of probes yet. *)
            WITH hash = TextHash(e.key) DO
              found := DoPut(
                t, new, hash, 
                e.key, 
                (*OUT*) probes, crowded
              );

            END;
          END;
          e := NullEntry
        END;
      END;
    END;
    RETURN NOT crowded
  END CopyEntries;

PROCEDURE Clear(set: TT) RAISES {} =
  BEGIN
    WITH t = set.rec DO
      LOCK t.mutex DO
        t.nullKeyPresent := FALSE;
        t.deletedKeyPresent := FALSE;
        WITH buck = t.buckets^ DO
          FOR i := 0 TO LAST(buck) DO 
            WITH e = buck[i] DO 
              IF NOT (e.key = NullKey) THEN e := NullEntry END
            END
          END;
        END;
        t.avgProbes := 0.0;
        t.numDeleted := 0;
        t.numFilled := 0;
        

      END
    END
  END Clear;

PROCEDURE Copy(set: TT): T RAISES {} =
  BEGIN
    WITH 
      copy = NEW(TT),
      c = copy.rec,
      t = set.rec
    DO
      LOCK t.mutex DO
        c := t;
        c.mutex := NEW (Thread.Mutex);
        c.buckets := NEW(Buckets, NUMBER(t.buckets^));
        c.buckets^ := t.buckets^; 
        RETURN copy
      END
    END
  END Copy;

PROCEDURE ToKeyList(set: TT): List.T RAISES {} =
  VAR list: List.T;
      refKey: TEXT;
  BEGIN
    WITH t = set.rec DO
      LOCK t.mutex DO
        list := NIL;
        IF t.nullKeyPresent THEN
          refKey := NullKey;
          list := List.New(refKey, list)
        END;
        IF t.deletedKeyPresent THEN
          refKey := DeletedKey;
          list := List.New(refKey, list)
        END;
        WITH buck = t.buckets^ DO
          FOR i := 0 TO LAST(buck) DO
            WITH e = buck[i] DO
              IF NOT (e.key = NullKey) 
              AND NOT (e.key = DeletedKey) THEN
                refKey := e.key;
                list := List.New(refKey, list)
              END
            END
          END
        END;
        RETURN list;
      END
    END
  END ToKeyList;

PROCEDURE Enumerate(
    set: TT; 
    proc: EnumerateProc; 
    data: REFANY; 
    VAR (*OUT*) key: Key; 
  ): BOOLEAN =
  VAR 
  BEGIN
    WITH t = set.rec DO
      LOCK t.mutex DO
        IF t.nullKeyPresent THEN
          key := NullKey;

          IF proc(data, key) THEN RETURN TRUE END;

        END;
        IF t.deletedKeyPresent THEN
          key := DeletedKey;

          IF proc(data, key) THEN RETURN TRUE END;

        END;
        WITH buck = t.buckets^ DO
          FOR i := 0 TO LAST(buck) DO
            WITH e = buck[i] DO
              IF NOT (e.key = NullKey) 
              AND NOT (e.key = DeletedKey) THEN
                key := e.key;

                IF proc(data, key) THEN RETURN TRUE END;

              END
            END
          END
        END;
        key := NullKey;
        RETURN FALSE;
      END
    END
  END Enumerate;

BEGIN
END TextSet.
