doc/clminfo2.azm

\import{mcx.zmm}

\begin{pud::man}{

   {name}{clm info2}
   {html_title}{The clm info2 manual}
   {author}{Stijn van Dongen}
   {section}{1}
   {synstyle}{long}
   {defstyle}{long}

   \man_share
}

\${html}{\"pud::man::maketoc"}

\sec{name}{NAME}
\NAME{clm info2}{compute performance measures for graphs and clusterings.}

\disclaim_clm{info2}

\sec{synopsis}{SYNOPSIS}
\par{
   \clm{info2} [options] <graph file> <cluster file> <cluster file>*
   }

\par{
   \clm{info2}
      \synoptopt{-o}{fname}{write to file \bf{fname}}
      \synoptopt{-pi}{f}{apply inflation beforehand}
      \synoptopt{--list}{list efficiency for all nodes}
      \shared_synoptopt{-tf}
      \synoptopt{-cl-ceil}{<num>}{skip clusters of size exceeding \bf{<num>}}
      \synoptopt{-cat-max}{<num>}{do at most \bf{<num>} tree levels}
      \synoptopt{-cl-tree}{fname}{expect file with nested clusterings}
      \shared_synoptopt{-t}
      \shared_synoptopt{-J}
      \shared_synoptopt{-j}
      \stdsynopt
      <matrix file> <cluster file> <cluster file>*
   }

\sec{description}{DESCRIPTION}

\par{
   \clm{info2} is a streamlined and updated version of \clm{info}. The
   latter outputs a key-value format listing a number of measures. In contrast,
   \clm{info2} only outputs the so-called efficiency criterion, a quality
   index for networks and clusterings. This criterion can be generated for
   each node independently with the \genopt{--list} option, indicating how
   well a clustering captures the neighbour distribution of a given node.
   }

\par{
   \clm{info2} can utilise threading and job dispatching. This may be useful
   when dealing with very large graphs.
   }

\par{
   Multiple clusterings can be supplied on the command line.
   Output is tabular, each row corresponding to a clustering, in the
   order in which the clusterings were supplied on the command line.
   Multiple columns will result only if node-wise output is requested
   with \genopt{--list}.
   By default a single number is produced for each individual clustering:
   the mean of all node-wise scores for that clustering.
   }

\par{
   The \bf{efficiency} factor is described in [1] (see
   the \secref{references} section).  It tries to balance the dual aims of
   capturing a lot of edges or edge weights and keeping the cluster footprint
   or area fraction small.  The efficiency number has several appealing
   mathematical properties, cf. [1].}

\sec{options}{OPTIONS}

\'begin{itemize}{\mcx_itemopts}

\item{\defopt{-o}{fname}{output file name}}

\item{\defopt{-pi}{f}{apply inflation beforehand}}
\car{
   Apply inflation to the graph matrix and compute the performance
   measures for the result.}

\shared_itemopt{-tf}
\car{\shared_defopt{-tf}}

\item{\defopt{--list}{list efficiency for all nodes}}
\car{
   The efficiency scores for all nodes are given on a single line.
   Each clustering specified corresponds to a single line.
   }


\items{
   {\defopt{-cl-tree}{fname}{expect file with nested clusterings (cone format)}}
   {\defopt{-cl-ceil}{<num>}{skip (nested) clusters of size exceeding <num>}}
}
\car{
   The specified file should contain a hierarchy of nested
   clusterings such as generated by \mclcm. The output is then
   in a special format, undocumented but easy to understand.
   Its purpose is to help cherrypick a single clustering
   from a tree, in conjunction with the slightly experimental
   and undocumented program \bf{mlmfifofum}.
   }

\par{
   The measure that is used is very slow to compute for large clusters, and
   generally it will be outside any interesting range (i.e. it will be small).
   Use \genopt{-cl-ceil} to skip clusters exceeding the specified size \-
   \clm{info2} will directly proceed to subclusters if they exist.
   }

\item{\defopt{-cat-max}{<num>}{do at most <num> tree levels}}
\car{
   This only has effect when used with \genopt{-cl-tree}.
   \clm{info2} will start at the most fine-grained level, working upwards.
   }


\items{
   {\defopt{-t}{<int>}{use <int> threads}}
   {\defopt{-j}{<intj>}{this job has index <intj>}}
   {\defopt{-J}{<intJ>}{a total of <intJ> jobs are used}}
}
\car{
   For very large graphs (millions of nodes) and clusterings with large
   clusters it may be helpful to allow this program to use multiple CPUs.
   Additionally it is possible to spread the computation over multiple
   jobs/machines.  These three options are described in the \sibref{clmprotocols} manual page.
   The following set of options, if given to as many commands, defines three jobs, each running four threads.
   }

\verbatim{-t 4 -J 3 -j 0 -o out.0
-t 4 -J 3 -j 1 -o out.1
-t 4 -J 3 -j 2 -o out.2}

\car{
   The output can then be collected with
   }

\verbatim{clxdo add_table out.[0-2]}

\stddefopt

\end{itemize}


\sec{author}{AUTHOR}
\par{
   Stijn van Dongen.}

\sec{seealso}{SEE ALSO}
\par{
   \mysib{mclfamily} for an overview of all the documentation
   and the utilities in the mcl family.}

\sec{references}{REFERENCES}

\par{
   [1] Stijn van Dongen. \it{Performance criteria for graph clustering and Markov
   cluster experiments}.  Technical Report INS-R0012, National Research
   Institute for Mathematics and Computer Science in the Netherlands,
   Amsterdam, May 2000.\|
   \httpref{http://www.cwi.nl/ftp/CWIreports/INS/INS-R0012.ps.Z}}

\end{pud::man}